json_scanner 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +69 -3
- data/ext/json_scanner/json_scanner.c +449 -94
- data/ext/json_scanner/json_scanner.h +1 -0
- data/lib/json_scanner/version.rb +1 -1
- data/spec/extensiontesttask.rb +128 -0
- data/spec/json_scanner_spec.c +0 -0
- data/spec/json_scanner_spec.rb +303 -16
- metadata +5 -3
@@ -1,8 +1,12 @@
|
|
1
1
|
#include "json_scanner.h"
|
2
2
|
|
3
3
|
VALUE rb_mJsonScanner;
|
4
|
-
VALUE
|
4
|
+
VALUE rb_cJsonScannerConfig;
|
5
5
|
VALUE rb_eJsonScannerParseError;
|
6
|
+
#define BYTES_CONSUMED "bytes_consumed"
|
7
|
+
ID rb_iv_bytes_consumed;
|
8
|
+
#define SCAN_KWARGS_SIZE 8
|
9
|
+
ID scan_kwargs_table[SCAN_KWARGS_SIZE];
|
6
10
|
|
7
11
|
VALUE null_sym;
|
8
12
|
VALUE boolean_sym;
|
@@ -11,11 +15,13 @@ VALUE string_sym;
|
|
11
15
|
VALUE object_sym;
|
12
16
|
VALUE array_sym;
|
13
17
|
|
18
|
+
VALUE any_key_sym;
|
19
|
+
|
14
20
|
enum matcher_type
|
15
21
|
{
|
16
22
|
MATCHER_KEY,
|
17
23
|
MATCHER_INDEX,
|
18
|
-
|
24
|
+
MATCHER_ANY_KEY,
|
19
25
|
MATCHER_INDEX_RANGE,
|
20
26
|
// MATCHER_KEYS_LIST,
|
21
27
|
// MATCHER_KEY_REGEX,
|
@@ -70,30 +76,115 @@ typedef struct
|
|
70
76
|
typedef struct
|
71
77
|
{
|
72
78
|
int with_path;
|
73
|
-
|
79
|
+
int symbolize_path_keys;
|
74
80
|
int paths_len;
|
75
|
-
|
81
|
+
paths_t *paths;
|
76
82
|
int current_path_len;
|
77
83
|
int max_path_len;
|
84
|
+
path_elem_t *current_path;
|
78
85
|
// Easier to use a Ruby array for result than convert later
|
86
|
+
// must be supplied by the caller and RB_GC_GUARD-ed if it isn't on the stack
|
79
87
|
VALUE points_list;
|
80
88
|
// by depth
|
81
89
|
size_t *starts;
|
82
90
|
// VALUE rb_err;
|
83
91
|
yajl_handle handle;
|
92
|
+
size_t yajl_bytes_consumed;
|
84
93
|
} scan_ctx;
|
85
94
|
|
95
|
+
/*
 * Returns the absolute input offset reached so far: the bytes accumulated in
 * ctx->yajl_bytes_consumed plus what the current yajl handle reports.
 *
 * Declared `static inline`: a plain `inline` definition with external linkage
 * provides no out-of-line definition in C99/C11, so any call the compiler
 * chooses not to inline (e.g. at -O0) would be an undefined reference at link
 * time.
 */
static inline size_t scan_ctx_get_bytes_consumed(scan_ctx *ctx)
{
  return ctx->yajl_bytes_consumed + yajl_get_bytes_consumed(ctx->handle);
}
|
99
|
+
|
100
|
+
/*
 * Adds the byte count reported by the current yajl handle to the running
 * total kept in ctx->yajl_bytes_consumed.
 *
 * Declared `static inline`: a plain `inline` definition with external linkage
 * provides no out-of-line definition in C99/C11, so any call the compiler
 * chooses not to inline (e.g. at -O0) would be an undefined reference at link
 * time.
 */
static inline void scan_ctx_update_bytes_consumed(scan_ctx *ctx)
{
  ctx->yajl_bytes_consumed += yajl_get_bytes_consumed(ctx->handle);
}
|
104
|
+
|
105
|
+
/*
 * Debug helper: dumps every field of `ctx` to stderr in a struct-like layout.
 *
 * Safe to call on a freshly allocated (not yet initialized) context: the
 * `paths`, `current_path` and `starts` buffers are only read when non-NULL,
 * and `current_path` is never read beyond its `max_path_len` capacity even
 * when the parser is nested deeper than that.
 */
void scan_ctx_debug(scan_ctx *ctx)
{
  // actually might have been cleared by GC already, be careful, debug only when in valid state
  VALUE points_list_inspect = ctx->points_list == Qundef ? rb_str_new_cstr("undef") : rb_sprintf("%" PRIsVALUE, rb_inspect(ctx->points_list));
  // current_path only has max_path_len slots, while current_path_len keeps
  // counting on deeper nesting, so clamp what we print.
  int visible_depth = ctx->current_path_len < ctx->max_path_len ? ctx->current_path_len : ctx->max_path_len;

  fprintf(stderr, "\nscan_ctx {\n");
  fprintf(stderr, " with_path: %s,\n", ctx->with_path ? "true" : "false");
  fprintf(stderr, " symbolize_path_keys: %s,\n", ctx->symbolize_path_keys ? "true" : "false");
  fprintf(stderr, " paths_len: %d,\n", ctx->paths_len);

  fprintf(stderr, " paths: [\n");
  for (int i = 0; ctx->paths && i < ctx->paths_len; i++)
  {
    fprintf(stderr, " [");
    for (int j = 0; j < ctx->paths[i].len; j++)
    {
      switch (ctx->paths[i].elems[j].type)
      {
      case MATCHER_KEY:
        fprintf(stderr, "'%.*s'", (int)ctx->paths[i].elems[j].value.key.len, ctx->paths[i].elems[j].value.key.val);
        break;
      case MATCHER_INDEX:
        fprintf(stderr, "%ld", ctx->paths[i].elems[j].value.index);
        break;
      case MATCHER_INDEX_RANGE:
        fprintf(stderr, "(%ld..%ld)", ctx->paths[i].elems[j].value.range.start, ctx->paths[i].elems[j].value.range.end);
        break;
      case MATCHER_ANY_KEY:
        fprintf(stderr, "('*'..'*')");
        break;
      }
      if (j < ctx->paths[i].len - 1)
        fprintf(stderr, ", ");
    }
    fprintf(stderr, "],\n");
  }
  fprintf(stderr, " ],\n");

  fprintf(stderr, " current_path_len: %d,\n", ctx->current_path_len);
  fprintf(stderr, " max_path_len: %d,\n", ctx->max_path_len);
  fprintf(stderr, " current_path: [");
  for (int i = 0; ctx->current_path && i < visible_depth; i++)
  {
    switch (ctx->current_path[i].type)
    {
    case PATH_KEY:
      fprintf(stderr, "'%.*s'", (int)ctx->current_path[i].value.key.len, ctx->current_path[i].value.key.val);
      break;
    case PATH_INDEX:
      fprintf(stderr, "%ld", ctx->current_path[i].value.index);
      break;
    }
    if (i < visible_depth - 1)
      fprintf(stderr, ", ");
  }
  fprintf(stderr, "],\n");

  fprintf(stderr, " points_list: %.*s,\n", RSTRING_LENINT(points_list_inspect), RSTRING_PTR(points_list_inspect));
  fprintf(stderr, " starts: [");
  // starts holds max_path_len + 1 entries once allocated; it is NULL before that
  for (int i = 0; ctx->starts && i <= ctx->max_path_len; i++)
  {
    // size_t must be printed with %zu; %ld is undefined where long is narrower
    fprintf(stderr, "%zu", ctx->starts[i]);
    if (i < ctx->max_path_len)
      fprintf(stderr, ", ");
  }
  fprintf(stderr, "],\n");

  // %p requires a void * argument
  fprintf(stderr, " handle: %p,\n", (void *)ctx->handle);
  fprintf(stderr, " yajl_bytes_consumed: %zu,\n", ctx->yajl_bytes_consumed);
  fprintf(stderr, "}\n\n\n");
}
|
175
|
+
|
86
176
|
// FIXME: This will cause memory leak if ruby_xmalloc raises
|
87
|
-
|
177
|
+
// path_ary must be RB_GC_GUARD-ed by the caller
|
178
|
+
VALUE scan_ctx_init(scan_ctx *ctx, VALUE path_ary, VALUE string_keys)
|
88
179
|
{
|
89
180
|
int path_ary_len;
|
90
|
-
scan_ctx *ctx;
|
91
181
|
paths_t *paths;
|
92
182
|
// TODO: Allow to_ary and sized enumerables
|
93
183
|
rb_check_type(path_ary, T_ARRAY);
|
94
184
|
path_ary_len = rb_long2int(rb_array_len(path_ary));
|
95
185
|
// Check types early before any allocations, so exception is ok
|
96
186
|
// TODO: Fix this, just handle errors
|
187
|
+
// It's not possible that another Ruby thread changes path_ary items between these two loops, because C call holds GVL
|
97
188
|
for (int i = 0; i < path_ary_len; i++)
|
98
189
|
{
|
99
190
|
int path_len;
|
@@ -103,38 +194,42 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
|
|
103
194
|
for (int j = 0; j < path_len; j++)
|
104
195
|
{
|
105
196
|
VALUE entry = rb_ary_entry(path, j);
|
106
|
-
|
107
|
-
if (type == T_STRING)
|
197
|
+
switch (TYPE(entry))
|
108
198
|
{
|
199
|
+
case T_SYMBOL:
|
200
|
+
entry = rb_sym2str(entry);
|
201
|
+
/* fall through */
|
202
|
+
case T_STRING:
|
109
203
|
#if LONG_MAX > SIZE_MAX
|
110
204
|
RSTRING_LENINT(entry);
|
111
205
|
#endif
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
206
|
+
break;
|
207
|
+
case T_FIXNUM:
|
208
|
+
case T_BIGNUM:
|
209
|
+
NUM2LONG(entry);
|
210
|
+
break;
|
211
|
+
default:
|
118
212
|
{
|
119
213
|
VALUE range_beg, range_end;
|
120
214
|
long end_val;
|
121
215
|
int open_ended;
|
122
216
|
if (rb_range_values(entry, &range_beg, &range_end, &open_ended) != Qtrue)
|
123
|
-
|
124
|
-
if (
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
217
|
+
return rb_exc_new_cstr(rb_eArgError, "path elements must be strings, integers, or ranges");
|
218
|
+
if (range_beg != any_key_sym || range_end != any_key_sym)
|
219
|
+
{
|
220
|
+
if (NUM2LONG(range_beg) < 0L)
|
221
|
+
return rb_exc_new_cstr(rb_eArgError, "range start must be positive");
|
222
|
+
end_val = NUM2LONG(range_end);
|
223
|
+
if (end_val < -1L)
|
224
|
+
return rb_exc_new_cstr(rb_eArgError, "range end must be positive or -1");
|
225
|
+
if (end_val == -1L && open_ended)
|
226
|
+
return rb_exc_new_cstr(rb_eArgError, "range with -1 end must be closed");
|
227
|
+
}
|
228
|
+
}
|
131
229
|
}
|
132
230
|
}
|
133
231
|
}
|
134
232
|
|
135
|
-
ctx = ruby_xmalloc(sizeof(scan_ctx));
|
136
|
-
|
137
|
-
ctx->with_path = RTEST(with_path);
|
138
233
|
ctx->max_path_len = 0;
|
139
234
|
|
140
235
|
paths = ruby_xmalloc(sizeof(paths_t) * path_ary_len);
|
@@ -149,9 +244,20 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
|
|
149
244
|
for (int j = 0; j < path_len; j++)
|
150
245
|
{
|
151
246
|
VALUE entry = rb_ary_entry(path, j);
|
152
|
-
|
153
|
-
if (type == T_STRING)
|
247
|
+
switch (TYPE(entry))
|
154
248
|
{
|
249
|
+
case T_SYMBOL:
|
250
|
+
entry = rb_sym2str(entry);
|
251
|
+
/* fall through */
|
252
|
+
case T_STRING:
|
253
|
+
{
|
254
|
+
if (string_keys != Qundef)
|
255
|
+
{
|
256
|
+
// If string_keys is provided, we need to duplicate the string
|
257
|
+
// to avoid use-after-free issues and to add the newly created string to the string_keys array
|
258
|
+
entry = rb_str_dup(entry);
|
259
|
+
rb_ary_push(string_keys, entry);
|
260
|
+
}
|
155
261
|
paths[i].elems[j].type = MATCHER_KEY;
|
156
262
|
paths[i].elems[j].value.key.val = RSTRING_PTR(entry);
|
157
263
|
#if LONG_MAX > SIZE_MAX
|
@@ -160,25 +266,36 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
|
|
160
266
|
paths[i].elems[j].value.key.len = RSTRING_LEN(entry);
|
161
267
|
#endif
|
162
268
|
}
|
163
|
-
|
269
|
+
break;
|
270
|
+
case T_FIXNUM:
|
271
|
+
case T_BIGNUM:
|
164
272
|
{
|
165
273
|
paths[i].elems[j].type = MATCHER_INDEX;
|
166
274
|
paths[i].elems[j].value.index = FIX2LONG(entry);
|
167
275
|
}
|
168
|
-
|
276
|
+
break;
|
277
|
+
default:
|
169
278
|
{
|
170
279
|
VALUE range_beg, range_end;
|
171
280
|
int open_ended;
|
172
|
-
paths[i].elems[j].type = MATCHER_INDEX_RANGE;
|
173
281
|
rb_range_values(entry, &range_beg, &range_end, &open_ended);
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
paths[i].elems[j].value.range.
|
282
|
+
if (range_beg == any_key_sym && range_end == any_key_sym)
|
283
|
+
{
|
284
|
+
paths[i].elems[j].type = MATCHER_ANY_KEY;
|
285
|
+
}
|
286
|
+
else
|
287
|
+
{
|
288
|
+
paths[i].elems[j].type = MATCHER_INDEX_RANGE;
|
289
|
+
paths[i].elems[j].value.range.start = NUM2LONG(range_beg);
|
290
|
+
paths[i].elems[j].value.range.end = NUM2LONG(range_end);
|
291
|
+
// (value..-1) works as expected, (value...-1) is forbidden above
|
292
|
+
if (paths[i].elems[j].value.range.end == -1L)
|
293
|
+
paths[i].elems[j].value.range.end = LONG_MAX;
|
294
|
+
// -1 here is fine, so, (0...0) works just as expected - doesn't match anything
|
295
|
+
if (open_ended)
|
296
|
+
paths[i].elems[j].value.range.end--;
|
297
|
+
}
|
298
|
+
}
|
182
299
|
}
|
183
300
|
}
|
184
301
|
paths[i].len = path_len;
|
@@ -189,32 +306,37 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
|
|
189
306
|
ctx->paths_len = path_ary_len;
|
190
307
|
ctx->current_path = ruby_xmalloc2(sizeof(path_elem_t), ctx->max_path_len);
|
191
308
|
|
192
|
-
ctx->
|
193
|
-
|
194
|
-
|
195
|
-
{
|
196
|
-
rb_ary_push(ctx->points_list, rb_ary_new());
|
197
|
-
}
|
309
|
+
ctx->starts = ruby_xmalloc2(sizeof(size_t), ctx->max_path_len + 1);
|
310
|
+
return Qundef; // no error
|
311
|
+
}
|
198
312
|
|
199
|
-
|
313
|
+
// resets temporary values in the config
|
314
|
+
void scan_ctx_reset(scan_ctx *ctx, VALUE points_list, int with_path, int symbolize_path_keys)
|
315
|
+
{
|
316
|
+
// TODO: reset matched_depth if implemented
|
317
|
+
ctx->current_path_len = 0;
|
200
318
|
// ctx->rb_err = Qnil;
|
201
319
|
ctx->handle = NULL;
|
202
|
-
|
203
|
-
|
320
|
+
ctx->yajl_bytes_consumed = 0;
|
321
|
+
ctx->points_list = points_list;
|
322
|
+
ctx->with_path = with_path;
|
323
|
+
ctx->symbolize_path_keys = symbolize_path_keys;
|
204
324
|
}
|
205
325
|
|
206
326
|
/*
 * Releases every buffer owned by `ctx` (starts, current_path, and the path
 * matchers with their element arrays). The `ctx` struct itself is NOT freed;
 * that is left to the caller. A NULL `ctx` is accepted and ignored.
 */
void scan_ctx_free(scan_ctx *ctx)
{
  if (ctx == NULL)
    return;

  ruby_xfree(ctx->starts);
  ruby_xfree(ctx->current_path);

  if (ctx->paths != NULL)
  {
    /* each matcher owns its elems array; release those before the table */
    int idx = 0;
    while (idx < ctx->paths_len)
    {
      ruby_xfree(ctx->paths[idx].elems);
      idx++;
    }
    ruby_xfree(ctx->paths);
  }
}
|
219
341
|
|
220
342
|
// noexcept
|
@@ -239,42 +361,70 @@ typedef enum
|
|
239
361
|
} value_type;
|
240
362
|
|
241
363
|
// noexcept
|
242
|
-
|
364
|
+
/*
 * Builds the Ruby triple [start, end, type_symbol] describing the value that
 * has just been parsed.
 *
 * `end` is the current consumed-byte offset. For scalars `start` is
 * `end - length`; for objects and arrays it is the offset recorded in
 * sctx->starts for the current depth.
 */
VALUE create_point(scan_ctx *sctx, value_type type, size_t length)
{
  VALUE triple[3];
  VALUE result;
  size_t end_pos = scan_ctx_get_bytes_consumed(sctx);

  result = rb_ary_new_capa(3);
  triple[1] = ULL2NUM(end_pos);

  /* start offset: scalars are measured back from the end, containers use
     the offset stored when they were opened */
  switch (type)
  {
  case null_value:
  case boolean_value:
  case number_value:
  case string_value:
    triple[0] = ULL2NUM(end_pos - length);
    break;
  case object_value:
  case array_value:
    triple[0] = ULL2NUM(sctx->starts[sctx->current_path_len]);
    break;
  }

  /* type tag */
  switch (type)
  {
  case null_value:
    triple[2] = null_sym;
    break;
  case boolean_value:
    triple[2] = boolean_sym;
    break;
  case number_value:
    triple[2] = number_sym;
    break;
  case string_value:
    triple[2] = string_sym;
    break;
  case object_value:
    triple[2] = object_sym;
    break;
  case array_value:
    triple[2] = array_sym;
    break;
  }

  // rb_ary_cat raise only in case of a frozen array or if len is too long
  rb_ary_cat(result, triple, 3);
  return result;
}
|
403
|
+
|
404
|
+
// noexcept
|
405
|
+
/*
 * Converts the parser's current path into a Ruby array: object keys become
 * Strings (or Symbols when sctx->symbolize_path_keys is set), array positions
 * become Integers, and any unrecognized element type becomes nil.
 */
VALUE create_path(scan_ctx *sctx)
{
  VALUE path = rb_ary_new_capa(sctx->current_path_len);
  for (int i = 0; i < sctx->current_path_len; i++)
  {
    VALUE entry;
    switch (sctx->current_path[i].type)
    {
    case PATH_KEY:
      if (sctx->symbolize_path_keys)
        // Keys come straight from the scanned JSON. rb_intern2() would mint
        // an immortal ID-backed symbol for every distinct key, so hostile or
        // merely large inputs would grow memory without bound. rb_str_intern()
        // yields a garbage-collectable symbol instead; the binary-encoded
        // temporary string accepts any byte sequence, so no encoding error
        // can be raised here.
        entry = rb_str_intern(rb_str_new(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len));
      else
        entry = rb_str_new(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len);
      break;
    case PATH_INDEX:
      entry = LONG2NUM(sctx->current_path[i].value.index);
      break;
    default:
      entry = Qnil;
    }
    rb_ary_push(path, entry);
  }
  return path;
}
|
279
429
|
|
280
430
|
// noexcept
|
@@ -282,7 +432,8 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
282
432
|
{
|
283
433
|
// TODO: Abort parsing if all paths are matched and no more mathces are possible: only trivial key/index matchers at the current level
|
284
434
|
// TODO: Don't re-compare already matched prefixes; hard to invalidate, though
|
285
|
-
|
435
|
+
// TODO: Might fail in case of no memory
|
436
|
+
VALUE point = Qundef, path;
|
286
437
|
int match;
|
287
438
|
for (int i = 0; i < sctx->paths_len; i++)
|
288
439
|
{
|
@@ -294,6 +445,10 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
294
445
|
{
|
295
446
|
switch (sctx->paths[i].elems[j].type)
|
296
447
|
{
|
448
|
+
case MATCHER_ANY_KEY:
|
449
|
+
if (sctx->current_path[j].type != PATH_KEY)
|
450
|
+
match = false;
|
451
|
+
break;
|
297
452
|
case MATCHER_KEY:
|
298
453
|
if (sctx->current_path[j].type != PATH_KEY ||
|
299
454
|
sctx->current_path[j].value.key.len != sctx->paths[i].elems[j].value.key.len ||
|
@@ -319,7 +474,12 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
319
474
|
{
|
320
475
|
if (point == Qundef)
|
321
476
|
{
|
322
|
-
create_point(
|
477
|
+
point = create_point(sctx, type, length);
|
478
|
+
if (sctx->with_path)
|
479
|
+
{
|
480
|
+
path = create_path(sctx);
|
481
|
+
point = rb_ary_new_from_args(2, path, point);
|
482
|
+
}
|
323
483
|
}
|
324
484
|
// rb_ary_push raises only in case of a frozen array, which is not the case
|
325
485
|
// rb_ary_entry is safe
|
@@ -382,11 +542,9 @@ int scan_on_start_object(void *ctx)
|
|
382
542
|
return true;
|
383
543
|
}
|
384
544
|
increment_arr_index(sctx);
|
545
|
+
sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
|
385
546
|
if (sctx->current_path_len < sctx->max_path_len)
|
386
|
-
{
|
387
|
-
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
388
547
|
sctx->current_path[sctx->current_path_len].type = PATH_KEY;
|
389
|
-
}
|
390
548
|
sctx->current_path_len++;
|
391
549
|
return true;
|
392
550
|
}
|
@@ -409,9 +567,8 @@ int scan_on_end_object(void *ctx)
|
|
409
567
|
{
|
410
568
|
scan_ctx *sctx = (scan_ctx *)ctx;
|
411
569
|
sctx->current_path_len--;
|
412
|
-
if (sctx->current_path_len
|
413
|
-
|
414
|
-
save_point(sctx, object_value, 0);
|
570
|
+
if (sctx->current_path_len <= sctx->max_path_len)
|
571
|
+
save_point(sctx, object_value, 0);
|
415
572
|
return true;
|
416
573
|
}
|
417
574
|
|
@@ -425,9 +582,9 @@ int scan_on_start_array(void *ctx)
|
|
425
582
|
return true;
|
426
583
|
}
|
427
584
|
increment_arr_index(sctx);
|
585
|
+
sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
|
428
586
|
if (sctx->current_path_len < sctx->max_path_len)
|
429
587
|
{
|
430
|
-
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
431
588
|
sctx->current_path[sctx->current_path_len].type = PATH_INDEX;
|
432
589
|
sctx->current_path[sctx->current_path_len].value.index = -1;
|
433
590
|
}
|
@@ -440,12 +597,112 @@ int scan_on_end_array(void *ctx)
|
|
440
597
|
{
|
441
598
|
scan_ctx *sctx = (scan_ctx *)ctx;
|
442
599
|
sctx->current_path_len--;
|
443
|
-
if (sctx->current_path_len
|
444
|
-
|
445
|
-
save_point(sctx, array_value, 0);
|
600
|
+
if (sctx->current_path_len <= sctx->max_path_len)
|
601
|
+
save_point(sctx, array_value, 0);
|
446
602
|
return true;
|
447
603
|
}
|
448
604
|
|
605
|
+
void config_free(void *data)
|
606
|
+
{
|
607
|
+
scan_ctx_free((scan_ctx *)data);
|
608
|
+
ruby_xfree(data);
|
609
|
+
}
|
610
|
+
|
611
|
+
size_t config_size(const void *data)
|
612
|
+
{
|
613
|
+
// see ObjectSpace.memsize_of
|
614
|
+
scan_ctx *ctx = (scan_ctx *)data;
|
615
|
+
size_t res = sizeof(scan_ctx);
|
616
|
+
// current_path
|
617
|
+
if (ctx->current_path != NULL)
|
618
|
+
res += ctx->max_path_len * sizeof(path_elem_t);
|
619
|
+
// starts
|
620
|
+
if (ctx->starts != NULL)
|
621
|
+
res += ctx->max_path_len * sizeof(size_t);
|
622
|
+
if (ctx->paths != NULL)
|
623
|
+
{
|
624
|
+
res += ctx->paths_len * sizeof(paths_t);
|
625
|
+
for (int i = 0; i < ctx->paths_len; i++)
|
626
|
+
{
|
627
|
+
res += ctx->paths[i].len * sizeof(path_matcher_elem_t);
|
628
|
+
}
|
629
|
+
}
|
630
|
+
return res;
|
631
|
+
}
|
632
|
+
|
633
|
+
static const rb_data_type_t config_type = {
|
634
|
+
.wrap_struct_name = "json_scanner_config",
|
635
|
+
.function = {
|
636
|
+
.dfree = config_free,
|
637
|
+
.dsize = config_size,
|
638
|
+
},
|
639
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
640
|
+
};
|
641
|
+
|
642
|
+
/*
 * Allocator for JsonScanner::Config: creates the Ruby object together with an
 * empty scan_ctx (no paths, no buffers) that `initialize` fills in later.
 *
 * TypedData_Make_Struct is used instead of ruby_xmalloc followed by
 * TypedData_Wrap_Struct so that the struct is never left unowned if creating
 * the wrapper object raises.
 */
VALUE config_alloc(VALUE self)
{
  scan_ctx *ctx;
  VALUE obj = TypedData_Make_Struct(self, scan_ctx, &config_type, ctx);

  // The struct arrives zero-filled; spell out the "empty" state explicitly
  // because scan_ctx_free and config_size rely on NULL buffers.
  ctx->paths = NULL;
  ctx->paths_len = 0;
  ctx->current_path = NULL;
  ctx->max_path_len = 0;
  ctx->starts = NULL;
  // points_list must be Qundef (not zero) while no scan is in progress
  scan_ctx_reset(ctx, Qundef, false, false);
  return obj;
}
|
653
|
+
|
654
|
+
/*
 * JsonScanner::Config#initialize(path_ary)
 *
 * Builds the path matchers from `path_ary` into the wrapped scan_ctx. The
 * duplicated key strings produced while doing so are collected in an array
 * that is stored on the object under the name "string_keys".
 *
 * Raises the exception returned by scan_ctx_init when `path_ary` is invalid.
 */
VALUE config_m_initialize(VALUE self, VALUE path_ary)
{
  scan_ctx *config;
  VALUE key_holder;
  VALUE init_error;

  TypedData_Get_Struct(self, scan_ctx, &config_type, config);

  key_holder = rb_ary_new();
  init_error = scan_ctx_init(config, path_ary, key_holder);
  if (init_error != Qundef)
    rb_exc_raise(init_error);

  /* attach the key strings to the object */
  rb_iv_set(self, "string_keys", key_holder);
  return self;
}
|
668
|
+
|
669
|
+
/*
 * JsonScanner::Config#inspect
 *
 * Renders the configured matchers as "#<ClassName [[m, m, ...], [...]]>":
 * keys are printed in single quotes, indexes as integers, index ranges as
 * (start..end) and the any-key matcher as ('*'..'*').
 */
VALUE config_m_inspect(VALUE self)
{
  scan_ctx *ctx;
  VALUE out;

  TypedData_Get_Struct(self, scan_ctx, &config_type, ctx);
  out = rb_sprintf("#<%" PRIsVALUE " [", rb_class_name(CLASS_OF(self)));

  for (int p = 0; ctx->paths && p < ctx->paths_len; p++)
  {
    paths_t *path = &ctx->paths[p];

    /* separator goes in front of every path but the first */
    if (p > 0)
      rb_str_cat_cstr(out, ", ");
    rb_str_cat_cstr(out, "[");

    for (int e = 0; e < path->len; e++)
    {
      if (e > 0)
        rb_str_cat_cstr(out, ", ");

      switch (path->elems[e].type)
      {
      case MATCHER_KEY:
        rb_str_catf(out, "'%.*s'", (int)path->elems[e].value.key.len, path->elems[e].value.key.val);
        break;
      case MATCHER_INDEX:
        rb_str_catf(out, "%ld", path->elems[e].value.index);
        break;
      case MATCHER_INDEX_RANGE:
        rb_str_catf(out, "(%ld..%ld)", path->elems[e].value.range.start, path->elems[e].value.range.end);
        break;
      case MATCHER_ANY_KEY:
        rb_str_cat_cstr(out, "('*'..'*')");
        break;
      }
    }

    rb_str_cat_cstr(out, "]");
  }

  rb_str_cat_cstr(out, "]>");
  return out;
}
|
705
|
+
|
449
706
|
static yajl_callbacks scan_callbacks = {
|
450
707
|
scan_on_null,
|
451
708
|
scan_on_boolean,
|
@@ -459,19 +716,43 @@ static yajl_callbacks scan_callbacks = {
|
|
459
716
|
scan_on_start_array,
|
460
717
|
scan_on_end_array};
|
461
718
|
|
462
|
-
//
|
463
|
-
|
719
|
+
// def scan(json_str, path_arr, opts)
|
720
|
+
// opts
|
721
|
+
// with_path: false, verbose_error: false,
|
722
|
+
// the following opts converted to bool and passed to yajl_config if provided, ignored if not provided
|
723
|
+
// allow_comments, dont_validate_strings, allow_trailing_garbage, allow_multiple_values, allow_partial_values
|
724
|
+
VALUE scan(int argc, VALUE *argv, VALUE self)
|
464
725
|
{
|
726
|
+
VALUE json_str, path_ary, with_path_flag, kwargs;
|
727
|
+
VALUE kwargs_values[SCAN_KWARGS_SIZE];
|
728
|
+
|
729
|
+
int with_path = false, verbose_error = false, symbolize_path_keys = false;
|
465
730
|
char *json_text;
|
466
731
|
size_t json_text_len;
|
467
732
|
yajl_handle handle;
|
468
733
|
yajl_status stat;
|
469
734
|
scan_ctx *ctx;
|
470
|
-
|
735
|
+
int free_ctx = true;
|
736
|
+
VALUE err_msg = Qnil, bytes_consumed, err, result;
|
471
737
|
// Turned out callbacks can't raise exceptions
|
472
738
|
// VALUE callback_err;
|
473
|
-
|
474
|
-
|
739
|
+
#if RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7)
|
740
|
+
rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "21:", &json_str, &path_ary, &with_path_flag, &kwargs);
|
741
|
+
#else
|
742
|
+
rb_scan_args(argc, argv, "21:", &json_str, &path_ary, &with_path_flag, &kwargs);
|
743
|
+
#endif
|
744
|
+
// rb_io_write(rb_stderr, rb_sprintf("with_path_flag: %" PRIsVALUE " \n", with_path_flag));
|
745
|
+
with_path = RTEST(with_path_flag);
|
746
|
+
if (kwargs != Qnil)
|
747
|
+
{
|
748
|
+
rb_get_kwargs(kwargs, scan_kwargs_table, 0, SCAN_KWARGS_SIZE, kwargs_values);
|
749
|
+
if (kwargs_values[0] != Qundef)
|
750
|
+
with_path = RTEST(kwargs_values[0]);
|
751
|
+
if (kwargs_values[1] != Qundef)
|
752
|
+
verbose_error = RTEST(kwargs_values[1]);
|
753
|
+
if (kwargs_values[7] != Qundef)
|
754
|
+
symbolize_path_keys = RTEST(kwargs_values[7]);
|
755
|
+
}
|
475
756
|
rb_check_type(json_str, T_STRING);
|
476
757
|
json_text = RSTRING_PTR(json_str);
|
477
758
|
#if LONG_MAX > SIZE_MAX
|
@@ -479,32 +760,96 @@ VALUE scan(VALUE self, VALUE json_str, VALUE path_ary, VALUE with_path)
|
|
479
760
|
#else
|
480
761
|
json_text_len = RSTRING_LEN(json_str);
|
481
762
|
#endif
|
482
|
-
|
763
|
+
if (rb_obj_is_kind_of(path_ary, rb_cJsonScannerConfig))
|
764
|
+
{
|
765
|
+
free_ctx = false;
|
766
|
+
TypedData_Get_Struct(path_ary, scan_ctx, &config_type, ctx);
|
767
|
+
}
|
768
|
+
else
|
769
|
+
{
|
770
|
+
VALUE scan_ctx_init_err;
|
771
|
+
ctx = ruby_xmalloc(sizeof(scan_ctx));
|
772
|
+
scan_ctx_init_err = scan_ctx_init(ctx, path_ary, Qundef);
|
773
|
+
if (scan_ctx_init_err != Qundef)
|
774
|
+
{
|
775
|
+
ruby_xfree(ctx);
|
776
|
+
rb_exc_raise(scan_ctx_init_err);
|
777
|
+
}
|
778
|
+
}
|
779
|
+
// Need to keep a ref to result array on the stack to prevent it from being GC-ed
|
780
|
+
result = rb_ary_new_capa(ctx->paths_len);
|
781
|
+
for (int i = 0; i < ctx->paths_len; i++)
|
782
|
+
{
|
783
|
+
rb_ary_push(result, rb_ary_new());
|
784
|
+
}
|
785
|
+
scan_ctx_reset(ctx, result, with_path, symbolize_path_keys);
|
786
|
+
// scan_ctx_debug(ctx);
|
483
787
|
|
484
788
|
handle = yajl_alloc(&scan_callbacks, NULL, (void *)ctx);
|
789
|
+
if (kwargs != Qnil) // it's safe to read kwargs_values only if rb_get_kwargs was called
|
790
|
+
{
|
791
|
+
if (kwargs_values[2] != Qundef)
|
792
|
+
yajl_config(handle, yajl_allow_comments, RTEST(kwargs_values[2]));
|
793
|
+
if (kwargs_values[3] != Qundef)
|
794
|
+
yajl_config(handle, yajl_dont_validate_strings, RTEST(kwargs_values[3]));
|
795
|
+
if (kwargs_values[4] != Qundef)
|
796
|
+
yajl_config(handle, yajl_allow_trailing_garbage, RTEST(kwargs_values[4]));
|
797
|
+
if (kwargs_values[5] != Qundef)
|
798
|
+
yajl_config(handle, yajl_allow_multiple_values, RTEST(kwargs_values[5]));
|
799
|
+
if (kwargs_values[6] != Qundef)
|
800
|
+
yajl_config(handle, yajl_allow_partial_values, RTEST(kwargs_values[6]));
|
801
|
+
}
|
485
802
|
ctx->handle = handle;
|
486
|
-
// TODO: make it configurable
|
487
|
-
// yajl_config(handle, yajl_allow_comments, true);
|
488
|
-
// yajl_config(handle, yajl_allow_trailing_garbage, true);
|
489
803
|
stat = yajl_parse(handle, (unsigned char *)json_text, json_text_len);
|
804
|
+
scan_ctx_update_bytes_consumed(ctx);
|
490
805
|
if (stat == yajl_status_ok)
|
806
|
+
{
|
491
807
|
stat = yajl_complete_parse(handle);
|
808
|
+
scan_ctx_update_bytes_consumed(ctx);
|
809
|
+
}
|
492
810
|
|
493
811
|
if (stat != yajl_status_ok)
|
494
812
|
{
|
495
|
-
char *str = (char *)yajl_get_error(handle,
|
496
|
-
|
813
|
+
char *str = (char *)yajl_get_error(handle, verbose_error, (unsigned char *)json_text, json_text_len);
|
814
|
+
err_msg = rb_utf8_str_new_cstr(str);
|
815
|
+
// TODO: maybe use scan_ctx_get_bytes_consumed here too? But it makes difference in premature EOF
|
816
|
+
bytes_consumed = ULL2NUM(yajl_get_bytes_consumed(handle));
|
497
817
|
yajl_free_error(handle, (unsigned char *)str);
|
498
818
|
}
|
819
|
+
// // Needed when yajl_allow_partial_values is set
|
820
|
+
// if (ctx->current_path_len > 0)
|
821
|
+
// {
|
822
|
+
// if (ctx->current_path_len > ctx->max_path_len)
|
823
|
+
// ctx->current_path_len = ctx->max_path_len;
|
824
|
+
// for (int i = ctx->current_path_len - 1; i > 0; i--)
|
825
|
+
// {
|
826
|
+
// switch (ctx->current_path[i].type)
|
827
|
+
// {
|
828
|
+
// case PATH_KEY:
|
829
|
+
// scan_on_end_object(ctx);
|
830
|
+
// break;
|
831
|
+
// case PATH_INDEX:
|
832
|
+
// scan_on_end_array(ctx);
|
833
|
+
// break;
|
834
|
+
// }
|
835
|
+
// }
|
836
|
+
// }
|
499
837
|
// callback_err = ctx->rb_err;
|
500
|
-
|
501
|
-
|
838
|
+
if (free_ctx)
|
839
|
+
{
|
840
|
+
// fprintf(stderr, "free_ctx\n");
|
841
|
+
scan_ctx_free(ctx);
|
842
|
+
ruby_xfree(ctx);
|
843
|
+
}
|
502
844
|
yajl_free(handle);
|
503
|
-
if (
|
504
|
-
|
845
|
+
if (err_msg != Qnil)
|
846
|
+
{
|
847
|
+
err = rb_exc_new_str(rb_eJsonScannerParseError, err_msg);
|
848
|
+
rb_ivar_set(err, rb_iv_bytes_consumed, bytes_consumed);
|
849
|
+
rb_exc_raise(err);
|
850
|
+
}
|
505
851
|
// if (callback_err != Qnil)
|
506
852
|
// rb_exc_raise(callback_err);
|
507
|
-
// TODO: report yajl_get_bytes_consumed(handle)
|
508
853
|
return result;
|
509
854
|
}
|
510
855
|
|
@@ -512,19 +857,29 @@ RUBY_FUNC_EXPORTED void
|
|
512
857
|
Init_json_scanner(void)
|
513
858
|
{
|
514
859
|
rb_mJsonScanner = rb_define_module("JsonScanner");
|
860
|
+
rb_cJsonScannerConfig = rb_define_class_under(rb_mJsonScanner, "Config", rb_cObject);
|
861
|
+
rb_define_alloc_func(rb_cJsonScannerConfig, config_alloc);
|
862
|
+
rb_define_method(rb_cJsonScannerConfig, "initialize", config_m_initialize, 1);
|
863
|
+
rb_define_method(rb_cJsonScannerConfig, "inspect", config_m_inspect, 0);
|
515
864
|
rb_define_const(rb_mJsonScanner, "ANY_INDEX", rb_range_new(INT2FIX(0), INT2FIX(-1), false));
|
516
|
-
|
865
|
+
any_key_sym = rb_id2sym(rb_intern("*"));
|
866
|
+
rb_define_const(rb_mJsonScanner, "ANY_KEY", rb_range_new(any_key_sym, any_key_sym, false));
|
517
867
|
rb_eJsonScannerParseError = rb_define_class_under(rb_mJsonScanner, "ParseError", rb_eRuntimeError);
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
rb_define_const(rb_mJsonScannerOptions, "ALLOW_MULTIPLE_VALUES", INT2FIX(yajl_allow_multiple_values));
|
522
|
-
rb_define_const(rb_mJsonScannerOptions, "ALLOW_PARTIAL_VALUES", INT2FIX(yajl_allow_partial_values));
|
523
|
-
rb_define_module_function(rb_mJsonScanner, "scan", scan, 3);
|
868
|
+
rb_define_attr(rb_eJsonScannerParseError, BYTES_CONSUMED, true, false);
|
869
|
+
rb_iv_bytes_consumed = rb_intern("@" BYTES_CONSUMED);
|
870
|
+
rb_define_module_function(rb_mJsonScanner, "scan", scan, -1);
|
524
871
|
null_sym = rb_id2sym(rb_intern("null"));
|
525
872
|
boolean_sym = rb_id2sym(rb_intern("boolean"));
|
526
873
|
number_sym = rb_id2sym(rb_intern("number"));
|
527
874
|
string_sym = rb_id2sym(rb_intern("string"));
|
528
875
|
object_sym = rb_id2sym(rb_intern("object"));
|
529
876
|
array_sym = rb_id2sym(rb_intern("array"));
|
877
|
+
scan_kwargs_table[0] = rb_intern("with_path");
|
878
|
+
scan_kwargs_table[1] = rb_intern("verbose_error");
|
879
|
+
scan_kwargs_table[2] = rb_intern("allow_comments");
|
880
|
+
scan_kwargs_table[3] = rb_intern("dont_validate_strings");
|
881
|
+
scan_kwargs_table[4] = rb_intern("allow_trailing_garbage");
|
882
|
+
scan_kwargs_table[5] = rb_intern("allow_multiple_values");
|
883
|
+
scan_kwargs_table[6] = rb_intern("allow_partial_values");
|
884
|
+
scan_kwargs_table[7] = rb_intern("symbolize_path_keys");
|
530
885
|
}
|