json_scanner 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,12 @@
1
1
  #include "json_scanner.h"
2
2
 
3
3
  VALUE rb_mJsonScanner;
4
- VALUE rb_mJsonScannerOptions;
4
+ VALUE rb_cJsonScannerConfig;
5
5
  VALUE rb_eJsonScannerParseError;
6
+ #define BYTES_CONSUMED "bytes_consumed"
7
+ ID rb_iv_bytes_consumed;
8
+ #define SCAN_KWARGS_SIZE 8
9
+ ID scan_kwargs_table[SCAN_KWARGS_SIZE];
6
10
 
7
11
  VALUE null_sym;
8
12
  VALUE boolean_sym;
@@ -11,11 +15,13 @@ VALUE string_sym;
11
15
  VALUE object_sym;
12
16
  VALUE array_sym;
13
17
 
18
+ VALUE any_key_sym;
19
+
14
20
  enum matcher_type
15
21
  {
16
22
  MATCHER_KEY,
17
23
  MATCHER_INDEX,
18
- // MATCHER_ANY_KEY,
24
+ MATCHER_ANY_KEY,
19
25
  MATCHER_INDEX_RANGE,
20
26
  // MATCHER_KEYS_LIST,
21
27
  // MATCHER_KEY_REGEX,
@@ -70,30 +76,115 @@ typedef struct
70
76
  typedef struct
71
77
  {
72
78
  int with_path;
73
- paths_t *paths;
79
+ int symbolize_path_keys;
74
80
  int paths_len;
75
- path_elem_t *current_path;
81
+ paths_t *paths;
76
82
  int current_path_len;
77
83
  int max_path_len;
84
+ path_elem_t *current_path;
78
85
  // Easier to use a Ruby array for result than convert later
86
+ // must be supplied by the caller and RB_GC_GUARD-ed if it isn't on the stack
79
87
  VALUE points_list;
80
88
  // by depth
81
89
  size_t *starts;
82
90
  // VALUE rb_err;
83
91
  yajl_handle handle;
92
+ size_t yajl_bytes_consumed;
84
93
  } scan_ctx;
85
94
 
95
+ inline size_t scan_ctx_get_bytes_consumed(scan_ctx *ctx)
96
+ {
97
+ return ctx->yajl_bytes_consumed + yajl_get_bytes_consumed(ctx->handle);
98
+ }
99
+
100
+ inline void scan_ctx_update_bytes_consumed(scan_ctx *ctx)
101
+ {
102
+ ctx->yajl_bytes_consumed += yajl_get_bytes_consumed(ctx->handle);
103
+ }
104
+
105
+ void scan_ctx_debug(scan_ctx *ctx)
106
+ {
107
+ // actually might have been cleared by GC already, be careful, debug only when in valid state
108
+ VALUE points_list_inspect = ctx->points_list == Qundef ? rb_str_new_cstr("undef") : rb_sprintf("%" PRIsVALUE, rb_inspect(ctx->points_list));
109
+ fprintf(stderr, "\nscan_ctx {\n");
110
+ fprintf(stderr, " with_path: %s,\n", ctx->with_path ? "true" : "false");
111
+ fprintf(stderr, " symbolize_path_keys: %s,\n", ctx->symbolize_path_keys ? "true" : "false");
112
+ fprintf(stderr, " paths_len: %d,\n", ctx->paths_len);
113
+
114
+ fprintf(stderr, " paths: [\n");
115
+ for (int i = 0; ctx->paths && i < ctx->paths_len; i++)
116
+ {
117
+ fprintf(stderr, " [");
118
+ for (int j = 0; j < ctx->paths[i].len; j++)
119
+ {
120
+ switch (ctx->paths[i].elems[j].type)
121
+ {
122
+ case MATCHER_KEY:
123
+ fprintf(stderr, "'%.*s'", (int)ctx->paths[i].elems[j].value.key.len, ctx->paths[i].elems[j].value.key.val);
124
+ break;
125
+ case MATCHER_INDEX:
126
+ fprintf(stderr, "%ld", ctx->paths[i].elems[j].value.index);
127
+ break;
128
+ case MATCHER_INDEX_RANGE:
129
+ fprintf(stderr, "(%ld..%ld)", ctx->paths[i].elems[j].value.range.start, ctx->paths[i].elems[j].value.range.end);
130
+ break;
131
+ case MATCHER_ANY_KEY:
132
+ fprintf(stderr, "('*'..'*')");
133
+ break;
134
+ }
135
+ if (j < ctx->paths[i].len - 1)
136
+ fprintf(stderr, ", ");
137
+ }
138
+ fprintf(stderr, "],\n");
139
+ }
140
+ fprintf(stderr, " ],\n");
141
+
142
+ fprintf(stderr, " current_path_len: %d,\n", ctx->current_path_len);
143
+ fprintf(stderr, " max_path_len: %d,\n", ctx->max_path_len);
144
+ fprintf(stderr, " current_path: [");
145
+ for (int i = 0; i < ctx->current_path_len; i++)
146
+ {
147
+ switch (ctx->current_path[i].type)
148
+ {
149
+ case PATH_KEY:
150
+ fprintf(stderr, "'%.*s'", (int)ctx->current_path[i].value.key.len, ctx->current_path[i].value.key.val);
151
+ break;
152
+ case PATH_INDEX:
153
+ fprintf(stderr, "%ld", ctx->current_path[i].value.index);
154
+ break;
155
+ }
156
+ if (i < ctx->current_path_len - 1)
157
+ fprintf(stderr, ", ");
158
+ }
159
+ fprintf(stderr, "],\n");
160
+
161
+ fprintf(stderr, " points_list: %.*s,\n", RSTRING_LENINT(points_list_inspect), RSTRING_PTR(points_list_inspect));
162
+ fprintf(stderr, " starts: [");
163
+ for (int i = 0; i <= ctx->max_path_len; i++)
164
+ {
165
+ fprintf(stderr, "%ld", ctx->starts[i]);
166
+ if (i < ctx->max_path_len)
167
+ fprintf(stderr, ", ");
168
+ }
169
+ fprintf(stderr, "],\n");
170
+
171
+ fprintf(stderr, " handle: %p,\n", ctx->handle);
172
+ fprintf(stderr, " yajl_bytes_consumed: %ld,\n", ctx->yajl_bytes_consumed);
173
+ fprintf(stderr, "}\n\n\n");
174
+ }
175
+
86
176
  // FIXME: This will cause memory leak if ruby_xmalloc raises
87
- scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
177
+ // path_ary must be RB_GC_GUARD-ed by the caller
178
+ VALUE scan_ctx_init(scan_ctx *ctx, VALUE path_ary, VALUE string_keys)
88
179
  {
89
180
  int path_ary_len;
90
- scan_ctx *ctx;
91
181
  paths_t *paths;
92
182
  // TODO: Allow to_ary and sized enumerables
93
183
  rb_check_type(path_ary, T_ARRAY);
94
184
  path_ary_len = rb_long2int(rb_array_len(path_ary));
95
185
  // Check types early before any allocations, so exception is ok
96
186
  // TODO: Fix this, just handle errors
187
+ // It's not possible that another Ruby thread changes path_ary items between these two loops, because C call holds GVL
97
188
  for (int i = 0; i < path_ary_len; i++)
98
189
  {
99
190
  int path_len;
@@ -103,38 +194,42 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
103
194
  for (int j = 0; j < path_len; j++)
104
195
  {
105
196
  VALUE entry = rb_ary_entry(path, j);
106
- int type = TYPE(entry);
107
- if (type == T_STRING)
197
+ switch (TYPE(entry))
108
198
  {
199
+ case T_SYMBOL:
200
+ entry = rb_sym2str(entry);
201
+ /* fall through */
202
+ case T_STRING:
109
203
  #if LONG_MAX > SIZE_MAX
110
204
  RSTRING_LENINT(entry);
111
205
  #endif
112
- }
113
- else if (type == T_FIXNUM || type == T_BIGNUM)
114
- {
115
- RB_NUM2LONG(entry);
116
- }
117
- else
206
+ break;
207
+ case T_FIXNUM:
208
+ case T_BIGNUM:
209
+ NUM2LONG(entry);
210
+ break;
211
+ default:
118
212
  {
119
213
  VALUE range_beg, range_end;
120
214
  long end_val;
121
215
  int open_ended;
122
216
  if (rb_range_values(entry, &range_beg, &range_end, &open_ended) != Qtrue)
123
- rb_raise(rb_eArgError, "path elements must be strings, integers, or ranges");
124
- if (RB_NUM2LONG(range_beg) < 0L)
125
- rb_raise(rb_eArgError, "range start must be positive");
126
- end_val = RB_NUM2LONG(range_end);
127
- if (end_val < -1L)
128
- rb_raise(rb_eArgError, "range end must be positive or -1");
129
- if (end_val == -1L && open_ended)
130
- rb_raise(rb_eArgError, "range with -1 end must be closed");
217
+ return rb_exc_new_cstr(rb_eArgError, "path elements must be strings, integers, or ranges");
218
+ if (range_beg != any_key_sym || range_end != any_key_sym)
219
+ {
220
+ if (NUM2LONG(range_beg) < 0L)
221
+ return rb_exc_new_cstr(rb_eArgError, "range start must be positive");
222
+ end_val = NUM2LONG(range_end);
223
+ if (end_val < -1L)
224
+ return rb_exc_new_cstr(rb_eArgError, "range end must be positive or -1");
225
+ if (end_val == -1L && open_ended)
226
+ return rb_exc_new_cstr(rb_eArgError, "range with -1 end must be closed");
227
+ }
228
+ }
131
229
  }
132
230
  }
133
231
  }
134
232
 
135
- ctx = ruby_xmalloc(sizeof(scan_ctx));
136
-
137
- ctx->with_path = RTEST(with_path);
138
233
  ctx->max_path_len = 0;
139
234
 
140
235
  paths = ruby_xmalloc(sizeof(paths_t) * path_ary_len);
@@ -149,9 +244,20 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
149
244
  for (int j = 0; j < path_len; j++)
150
245
  {
151
246
  VALUE entry = rb_ary_entry(path, j);
152
- int type = TYPE(entry);
153
- if (type == T_STRING)
247
+ switch (TYPE(entry))
154
248
  {
249
+ case T_SYMBOL:
250
+ entry = rb_sym2str(entry);
251
+ /* fall through */
252
+ case T_STRING:
253
+ {
254
+ if (string_keys != Qundef)
255
+ {
256
+ // If string_keys is provided, we need to duplicate the string
257
+ // to avoid use-after-free issues and to add the newly created string to the string_keys array
258
+ entry = rb_str_dup(entry);
259
+ rb_ary_push(string_keys, entry);
260
+ }
155
261
  paths[i].elems[j].type = MATCHER_KEY;
156
262
  paths[i].elems[j].value.key.val = RSTRING_PTR(entry);
157
263
  #if LONG_MAX > SIZE_MAX
@@ -160,25 +266,36 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
160
266
  paths[i].elems[j].value.key.len = RSTRING_LEN(entry);
161
267
  #endif
162
268
  }
163
- else if (type == T_FIXNUM || type == T_BIGNUM)
269
+ break;
270
+ case T_FIXNUM:
271
+ case T_BIGNUM:
164
272
  {
165
273
  paths[i].elems[j].type = MATCHER_INDEX;
166
274
  paths[i].elems[j].value.index = FIX2LONG(entry);
167
275
  }
168
- else
276
+ break;
277
+ default:
169
278
  {
170
279
  VALUE range_beg, range_end;
171
280
  int open_ended;
172
- paths[i].elems[j].type = MATCHER_INDEX_RANGE;
173
281
  rb_range_values(entry, &range_beg, &range_end, &open_ended);
174
- paths[i].elems[j].value.range.start = RB_NUM2LONG(range_beg);
175
- paths[i].elems[j].value.range.end = RB_NUM2LONG(range_end);
176
- // (value..-1) works as expected, (value...-1) is forbidden above
177
- if (paths[i].elems[j].value.range.end == -1L)
178
- paths[i].elems[j].value.range.end = LONG_MAX;
179
- // -1 here is fine, so, (0...0) works just as expected - doesn't match anything
180
- if (open_ended)
181
- paths[i].elems[j].value.range.end--;
282
+ if (range_beg == any_key_sym && range_end == any_key_sym)
283
+ {
284
+ paths[i].elems[j].type = MATCHER_ANY_KEY;
285
+ }
286
+ else
287
+ {
288
+ paths[i].elems[j].type = MATCHER_INDEX_RANGE;
289
+ paths[i].elems[j].value.range.start = NUM2LONG(range_beg);
290
+ paths[i].elems[j].value.range.end = NUM2LONG(range_end);
291
+ // (value..-1) works as expected, (value...-1) is forbidden above
292
+ if (paths[i].elems[j].value.range.end == -1L)
293
+ paths[i].elems[j].value.range.end = LONG_MAX;
294
+ // -1 here is fine, so, (0...0) works just as expected - doesn't match anything
295
+ if (open_ended)
296
+ paths[i].elems[j].value.range.end--;
297
+ }
298
+ }
182
299
  }
183
300
  }
184
301
  paths[i].len = path_len;
@@ -189,32 +306,37 @@ scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
189
306
  ctx->paths_len = path_ary_len;
190
307
  ctx->current_path = ruby_xmalloc2(sizeof(path_elem_t), ctx->max_path_len);
191
308
 
192
- ctx->current_path_len = 0;
193
- ctx->points_list = rb_ary_new_capa(path_ary_len);
194
- for (int i = 0; i < path_ary_len; i++)
195
- {
196
- rb_ary_push(ctx->points_list, rb_ary_new());
197
- }
309
+ ctx->starts = ruby_xmalloc2(sizeof(size_t), ctx->max_path_len + 1);
310
+ return Qundef; // no error
311
+ }
198
312
 
199
- ctx->starts = ruby_xmalloc2(sizeof(size_t), ctx->max_path_len);
313
+ // resets temporary values in the config
314
+ void scan_ctx_reset(scan_ctx *ctx, VALUE points_list, int with_path, int symbolize_path_keys)
315
+ {
316
+ // TODO: reset matched_depth if implemented
317
+ ctx->current_path_len = 0;
200
318
  // ctx->rb_err = Qnil;
201
319
  ctx->handle = NULL;
202
-
203
- return ctx;
320
+ ctx->yajl_bytes_consumed = 0;
321
+ ctx->points_list = points_list;
322
+ ctx->with_path = with_path;
323
+ ctx->symbolize_path_keys = symbolize_path_keys;
204
324
  }
205
325
 
206
326
// Releases every buffer owned by the context. The scan_ctx struct itself is
// not freed here; that is the caller's job. A NULL ctx is a no-op.
void scan_ctx_free(scan_ctx *ctx)
{
    // fprintf(stderr, "scan_ctx_free\n");
    if (ctx == NULL)
        return;

    ruby_xfree(ctx->starts);
    ruby_xfree(ctx->current_path);

    if (ctx->paths != NULL)
    {
        paths_t *matcher = ctx->paths;
        for (int remaining = ctx->paths_len; remaining > 0; remaining--, matcher++)
        {
            ruby_xfree(matcher->elems);
        }
        ruby_xfree(ctx->paths);
    }
}
219
341
 
220
342
  // noexcept
@@ -239,42 +361,70 @@ typedef enum
239
361
  } value_type;
240
362
 
241
363
  // noexcept
242
- void create_point(VALUE *point, scan_ctx *sctx, value_type type, size_t length, size_t curr_pos)
364
+ VALUE create_point(scan_ctx *sctx, value_type type, size_t length)
243
365
  {
244
- VALUE values[3];
245
- *point = rb_ary_new_capa(3);
366
+ VALUE values[3], point;
367
+ size_t curr_pos = scan_ctx_get_bytes_consumed(sctx);
368
+ point = rb_ary_new_capa(3);
246
369
  // noexcept
247
- values[1] = RB_ULONG2NUM(curr_pos);
370
+ values[1] = ULL2NUM(curr_pos);
248
371
  switch (type)
249
372
  {
250
373
  // FIXME: size_t can be longer than ulong
251
374
  case null_value:
252
- values[0] = RB_ULONG2NUM(curr_pos - length);
375
+ values[0] = ULL2NUM(curr_pos - length);
253
376
  values[2] = null_sym;
254
377
  break;
255
378
  case boolean_value:
256
- values[0] = RB_ULONG2NUM(curr_pos - length);
379
+ values[0] = ULL2NUM(curr_pos - length);
257
380
  values[2] = boolean_sym;
258
381
  break;
259
382
  case number_value:
260
- values[0] = RB_ULONG2NUM(curr_pos - length);
383
+ values[0] = ULL2NUM(curr_pos - length);
261
384
  values[2] = number_sym;
262
385
  break;
263
386
  case string_value:
264
- values[0] = RB_ULONG2NUM(curr_pos - length);
387
+ values[0] = ULL2NUM(curr_pos - length);
265
388
  values[2] = string_sym;
266
389
  break;
267
390
  case object_value:
268
- values[0] = RB_ULONG2NUM(sctx->starts[sctx->current_path_len]);
391
+ values[0] = ULL2NUM(sctx->starts[sctx->current_path_len]);
269
392
  values[2] = object_sym;
270
393
  break;
271
394
  case array_value:
272
- values[0] = RB_ULONG2NUM(sctx->starts[sctx->current_path_len]);
395
+ values[0] = ULL2NUM(sctx->starts[sctx->current_path_len]);
273
396
  values[2] = array_sym;
274
397
  break;
275
398
  }
276
399
  // rb_ary_cat raise only in case of a frozen array or if len is too long
277
- rb_ary_cat(*point, values, 3);
400
+ rb_ary_cat(point, values, 3);
401
+ return point;
402
+ }
403
+
404
+ // noexcept
405
+ VALUE create_path(scan_ctx *sctx)
406
+ {
407
+ VALUE path = rb_ary_new_capa(sctx->current_path_len);
408
+ for (int i = 0; i < sctx->current_path_len; i++)
409
+ {
410
+ VALUE entry;
411
+ switch (sctx->current_path[i].type)
412
+ {
413
+ case PATH_KEY:
414
+ if (sctx->symbolize_path_keys)
415
+ entry = rb_id2sym(rb_intern2(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len));
416
+ else
417
+ entry = rb_str_new(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len);
418
+ break;
419
+ case PATH_INDEX:
420
+ entry = LONG2NUM(sctx->current_path[i].value.index);
421
+ break;
422
+ default:
423
+ entry = Qnil;
424
+ }
425
+ rb_ary_push(path, entry);
426
+ }
427
+ return path;
278
428
  }
279
429
 
280
430
  // noexcept
@@ -282,7 +432,8 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
282
432
  {
283
433
  // TODO: Abort parsing if all paths are matched and no more mathces are possible: only trivial key/index matchers at the current level
284
434
  // TODO: Don't re-compare already matched prefixes; hard to invalidate, though
285
- VALUE point = Qundef;
435
+ // TODO: Might fail in case of no memory
436
+ VALUE point = Qundef, path;
286
437
  int match;
287
438
  for (int i = 0; i < sctx->paths_len; i++)
288
439
  {
@@ -294,6 +445,10 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
294
445
  {
295
446
  switch (sctx->paths[i].elems[j].type)
296
447
  {
448
+ case MATCHER_ANY_KEY:
449
+ if (sctx->current_path[j].type != PATH_KEY)
450
+ match = false;
451
+ break;
297
452
  case MATCHER_KEY:
298
453
  if (sctx->current_path[j].type != PATH_KEY ||
299
454
  sctx->current_path[j].value.key.len != sctx->paths[i].elems[j].value.key.len ||
@@ -319,7 +474,12 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
319
474
  {
320
475
  if (point == Qundef)
321
476
  {
322
- create_point(&point, sctx, type, length, yajl_get_bytes_consumed(sctx->handle));
477
+ point = create_point(sctx, type, length);
478
+ if (sctx->with_path)
479
+ {
480
+ path = create_path(sctx);
481
+ point = rb_ary_new_from_args(2, path, point);
482
+ }
323
483
  }
324
484
  // rb_ary_push raises only in case of a frozen array, which is not the case
325
485
  // rb_ary_entry is safe
@@ -382,11 +542,9 @@ int scan_on_start_object(void *ctx)
382
542
  return true;
383
543
  }
384
544
  increment_arr_index(sctx);
545
+ sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
385
546
  if (sctx->current_path_len < sctx->max_path_len)
386
- {
387
- sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
388
547
  sctx->current_path[sctx->current_path_len].type = PATH_KEY;
389
- }
390
548
  sctx->current_path_len++;
391
549
  return true;
392
550
  }
@@ -409,9 +567,8 @@ int scan_on_end_object(void *ctx)
409
567
  {
410
568
  scan_ctx *sctx = (scan_ctx *)ctx;
411
569
  sctx->current_path_len--;
412
- if (sctx->current_path_len >= sctx->max_path_len)
413
- return true;
414
- save_point(sctx, object_value, 0);
570
+ if (sctx->current_path_len <= sctx->max_path_len)
571
+ save_point(sctx, object_value, 0);
415
572
  return true;
416
573
  }
417
574
 
@@ -425,9 +582,9 @@ int scan_on_start_array(void *ctx)
425
582
  return true;
426
583
  }
427
584
  increment_arr_index(sctx);
585
+ sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
428
586
  if (sctx->current_path_len < sctx->max_path_len)
429
587
  {
430
- sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
431
588
  sctx->current_path[sctx->current_path_len].type = PATH_INDEX;
432
589
  sctx->current_path[sctx->current_path_len].value.index = -1;
433
590
  }
@@ -440,12 +597,112 @@ int scan_on_end_array(void *ctx)
440
597
  {
441
598
  scan_ctx *sctx = (scan_ctx *)ctx;
442
599
  sctx->current_path_len--;
443
- if (sctx->current_path_len >= sctx->max_path_len)
444
- return true;
445
- save_point(sctx, array_value, 0);
600
+ if (sctx->current_path_len <= sctx->max_path_len)
601
+ save_point(sctx, array_value, 0);
446
602
  return true;
447
603
  }
448
604
 
605
+ void config_free(void *data)
606
+ {
607
+ scan_ctx_free((scan_ctx *)data);
608
+ ruby_xfree(data);
609
+ }
610
+
611
+ size_t config_size(const void *data)
612
+ {
613
+ // see ObjectSpace.memsize_of
614
+ scan_ctx *ctx = (scan_ctx *)data;
615
+ size_t res = sizeof(scan_ctx);
616
+ // current_path
617
+ if (ctx->current_path != NULL)
618
+ res += ctx->max_path_len * sizeof(path_elem_t);
619
+ // starts
620
+ if (ctx->starts != NULL)
621
+ res += ctx->max_path_len * sizeof(size_t);
622
+ if (ctx->paths != NULL)
623
+ {
624
+ res += ctx->paths_len * sizeof(paths_t);
625
+ for (int i = 0; i < ctx->paths_len; i++)
626
+ {
627
+ res += ctx->paths[i].len * sizeof(path_matcher_elem_t);
628
+ }
629
+ }
630
+ return res;
631
+ }
632
+
633
+ static const rb_data_type_t config_type = {
634
+ .wrap_struct_name = "json_scanner_config",
635
+ .function = {
636
+ .dfree = config_free,
637
+ .dsize = config_size,
638
+ },
639
+ .flags = RUBY_TYPED_FREE_IMMEDIATELY,
640
+ };
641
+
642
+ VALUE config_alloc(VALUE self)
643
+ {
644
+ scan_ctx *ctx = ruby_xmalloc(sizeof(scan_ctx));
645
+ ctx->paths = NULL;
646
+ ctx->paths_len = 0;
647
+ ctx->current_path = NULL;
648
+ ctx->max_path_len = 0;
649
+ ctx->starts = NULL;
650
+ scan_ctx_reset(ctx, Qundef, false, false);
651
+ return TypedData_Wrap_Struct(self, &config_type, ctx);
652
+ }
653
+
654
+ VALUE config_m_initialize(VALUE self, VALUE path_ary)
655
+ {
656
+ scan_ctx *ctx;
657
+ VALUE scan_ctx_init_err, string_keys;
658
+ TypedData_Get_Struct(self, scan_ctx, &config_type, ctx);
659
+ string_keys = rb_ary_new();
660
+ scan_ctx_init_err = scan_ctx_init(ctx, path_ary, string_keys);
661
+ if (scan_ctx_init_err != Qundef)
662
+ {
663
+ rb_exc_raise(scan_ctx_init_err);
664
+ }
665
+ rb_iv_set(self, "string_keys", string_keys);
666
+ return self;
667
+ }
668
+
669
+ VALUE config_m_inspect(VALUE self)
670
+ {
671
+ scan_ctx *ctx;
672
+ VALUE res;
673
+ TypedData_Get_Struct(self, scan_ctx, &config_type, ctx);
674
+ res = rb_sprintf("#<%" PRIsVALUE " [", rb_class_name(CLASS_OF(self)));
675
+ for (int i = 0; ctx->paths && i < ctx->paths_len; i++)
676
+ {
677
+ rb_str_cat_cstr(res, "[");
678
+ for (int j = 0; j < ctx->paths[i].len; j++)
679
+ {
680
+ switch (ctx->paths[i].elems[j].type)
681
+ {
682
+ case MATCHER_KEY:
683
+ rb_str_catf(res, "'%.*s'", (int)ctx->paths[i].elems[j].value.key.len, ctx->paths[i].elems[j].value.key.val);
684
+ break;
685
+ case MATCHER_INDEX:
686
+ rb_str_catf(res, "%ld", ctx->paths[i].elems[j].value.index);
687
+ break;
688
+ case MATCHER_INDEX_RANGE:
689
+ rb_str_catf(res, "(%ld..%ld)", ctx->paths[i].elems[j].value.range.start, ctx->paths[i].elems[j].value.range.end);
690
+ break;
691
+ case MATCHER_ANY_KEY:
692
+ rb_str_cat_cstr(res, "('*'..'*')");
693
+ break;
694
+ }
695
+ if (j < ctx->paths[i].len - 1)
696
+ rb_str_cat_cstr(res, ", ");
697
+ }
698
+ rb_str_cat_cstr(res, "]");
699
+ if (i < ctx->paths_len - 1)
700
+ rb_str_cat_cstr(res, ", ");
701
+ }
702
+ rb_str_cat_cstr(res, "]>");
703
+ return res;
704
+ }
705
+
449
706
  static yajl_callbacks scan_callbacks = {
450
707
  scan_on_null,
451
708
  scan_on_boolean,
@@ -459,19 +716,43 @@ static yajl_callbacks scan_callbacks = {
459
716
  scan_on_start_array,
460
717
  scan_on_end_array};
461
718
 
462
- // TODO: make with_path optional kw: `with_path: false`
463
- VALUE scan(VALUE self, VALUE json_str, VALUE path_ary, VALUE with_path)
719
+ // def scan(json_str, path_arr, opts)
720
+ // opts
721
+ // with_path: false, verbose_error: false,
722
+ // the following opts converted to bool and passed to yajl_config if provided, ignored if not provided
723
+ // allow_comments, dont_validate_strings, allow_trailing_garbage, allow_multiple_values, allow_partial_values
724
+ VALUE scan(int argc, VALUE *argv, VALUE self)
464
725
  {
726
+ VALUE json_str, path_ary, with_path_flag, kwargs;
727
+ VALUE kwargs_values[SCAN_KWARGS_SIZE];
728
+
729
+ int with_path = false, verbose_error = false, symbolize_path_keys = false;
465
730
  char *json_text;
466
731
  size_t json_text_len;
467
732
  yajl_handle handle;
468
733
  yajl_status stat;
469
734
  scan_ctx *ctx;
470
- VALUE err = Qnil, result;
735
+ int free_ctx = true;
736
+ VALUE err_msg = Qnil, bytes_consumed, err, result;
471
737
  // Turned out callbacks can't raise exceptions
472
738
  // VALUE callback_err;
473
- // TODO
474
- int opt_verbose_error = 0;
739
+ #if RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7)
740
+ rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "21:", &json_str, &path_ary, &with_path_flag, &kwargs);
741
+ #else
742
+ rb_scan_args(argc, argv, "21:", &json_str, &path_ary, &with_path_flag, &kwargs);
743
+ #endif
744
+ // rb_io_write(rb_stderr, rb_sprintf("with_path_flag: %" PRIsVALUE " \n", with_path_flag));
745
+ with_path = RTEST(with_path_flag);
746
+ if (kwargs != Qnil)
747
+ {
748
+ rb_get_kwargs(kwargs, scan_kwargs_table, 0, SCAN_KWARGS_SIZE, kwargs_values);
749
+ if (kwargs_values[0] != Qundef)
750
+ with_path = RTEST(kwargs_values[0]);
751
+ if (kwargs_values[1] != Qundef)
752
+ verbose_error = RTEST(kwargs_values[1]);
753
+ if (kwargs_values[7] != Qundef)
754
+ symbolize_path_keys = RTEST(kwargs_values[7]);
755
+ }
475
756
  rb_check_type(json_str, T_STRING);
476
757
  json_text = RSTRING_PTR(json_str);
477
758
  #if LONG_MAX > SIZE_MAX
@@ -479,32 +760,96 @@ VALUE scan(VALUE self, VALUE json_str, VALUE path_ary, VALUE with_path)
479
760
  #else
480
761
  json_text_len = RSTRING_LEN(json_str);
481
762
  #endif
482
- ctx = scan_ctx_init(path_ary, with_path);
763
+ if (rb_obj_is_kind_of(path_ary, rb_cJsonScannerConfig))
764
+ {
765
+ free_ctx = false;
766
+ TypedData_Get_Struct(path_ary, scan_ctx, &config_type, ctx);
767
+ }
768
+ else
769
+ {
770
+ VALUE scan_ctx_init_err;
771
+ ctx = ruby_xmalloc(sizeof(scan_ctx));
772
+ scan_ctx_init_err = scan_ctx_init(ctx, path_ary, Qundef);
773
+ if (scan_ctx_init_err != Qundef)
774
+ {
775
+ ruby_xfree(ctx);
776
+ rb_exc_raise(scan_ctx_init_err);
777
+ }
778
+ }
779
+ // Need to keep a ref to result array on the stack to prevent it from being GC-ed
780
+ result = rb_ary_new_capa(ctx->paths_len);
781
+ for (int i = 0; i < ctx->paths_len; i++)
782
+ {
783
+ rb_ary_push(result, rb_ary_new());
784
+ }
785
+ scan_ctx_reset(ctx, result, with_path, symbolize_path_keys);
786
+ // scan_ctx_debug(ctx);
483
787
 
484
788
  handle = yajl_alloc(&scan_callbacks, NULL, (void *)ctx);
789
+ if (kwargs != Qnil) // it's safe to read kwargs_values only if rb_get_kwargs was called
790
+ {
791
+ if (kwargs_values[2] != Qundef)
792
+ yajl_config(handle, yajl_allow_comments, RTEST(kwargs_values[2]));
793
+ if (kwargs_values[3] != Qundef)
794
+ yajl_config(handle, yajl_dont_validate_strings, RTEST(kwargs_values[3]));
795
+ if (kwargs_values[4] != Qundef)
796
+ yajl_config(handle, yajl_allow_trailing_garbage, RTEST(kwargs_values[4]));
797
+ if (kwargs_values[5] != Qundef)
798
+ yajl_config(handle, yajl_allow_multiple_values, RTEST(kwargs_values[5]));
799
+ if (kwargs_values[6] != Qundef)
800
+ yajl_config(handle, yajl_allow_partial_values, RTEST(kwargs_values[6]));
801
+ }
485
802
  ctx->handle = handle;
486
- // TODO: make it configurable
487
- // yajl_config(handle, yajl_allow_comments, true);
488
- // yajl_config(handle, yajl_allow_trailing_garbage, true);
489
803
  stat = yajl_parse(handle, (unsigned char *)json_text, json_text_len);
804
+ scan_ctx_update_bytes_consumed(ctx);
490
805
  if (stat == yajl_status_ok)
806
+ {
491
807
  stat = yajl_complete_parse(handle);
808
+ scan_ctx_update_bytes_consumed(ctx);
809
+ }
492
810
 
493
811
  if (stat != yajl_status_ok)
494
812
  {
495
- char *str = (char *)yajl_get_error(handle, opt_verbose_error, (unsigned char *)json_text, json_text_len);
496
- err = rb_str_new_cstr(str);
813
+ char *str = (char *)yajl_get_error(handle, verbose_error, (unsigned char *)json_text, json_text_len);
814
+ err_msg = rb_utf8_str_new_cstr(str);
815
+ // TODO: maybe use scan_ctx_get_bytes_consumed here too? But it makes difference in premature EOF
816
+ bytes_consumed = ULL2NUM(yajl_get_bytes_consumed(handle));
497
817
  yajl_free_error(handle, (unsigned char *)str);
498
818
  }
819
+ // // Needed when yajl_allow_partial_values is set
820
+ // if (ctx->current_path_len > 0)
821
+ // {
822
+ // if (ctx->current_path_len > ctx->max_path_len)
823
+ // ctx->current_path_len = ctx->max_path_len;
824
+ // for (int i = ctx->current_path_len - 1; i > 0; i--)
825
+ // {
826
+ // switch (ctx->current_path[i].type)
827
+ // {
828
+ // case PATH_KEY:
829
+ // scan_on_end_object(ctx);
830
+ // break;
831
+ // case PATH_INDEX:
832
+ // scan_on_end_array(ctx);
833
+ // break;
834
+ // }
835
+ // }
836
+ // }
499
837
  // callback_err = ctx->rb_err;
500
- result = ctx->points_list;
501
- scan_ctx_free(ctx);
838
+ if (free_ctx)
839
+ {
840
+ // fprintf(stderr, "free_ctx\n");
841
+ scan_ctx_free(ctx);
842
+ ruby_xfree(ctx);
843
+ }
502
844
  yajl_free(handle);
503
- if (err != Qnil)
504
- rb_exc_raise(rb_exc_new_str(rb_eJsonScannerParseError, err));
845
+ if (err_msg != Qnil)
846
+ {
847
+ err = rb_exc_new_str(rb_eJsonScannerParseError, err_msg);
848
+ rb_ivar_set(err, rb_iv_bytes_consumed, bytes_consumed);
849
+ rb_exc_raise(err);
850
+ }
505
851
  // if (callback_err != Qnil)
506
852
  // rb_exc_raise(callback_err);
507
- // TODO: report yajl_get_bytes_consumed(handle)
508
853
  return result;
509
854
  }
510
855
 
@@ -512,19 +857,29 @@ RUBY_FUNC_EXPORTED void
512
857
  Init_json_scanner(void)
513
858
  {
514
859
  rb_mJsonScanner = rb_define_module("JsonScanner");
860
+ rb_cJsonScannerConfig = rb_define_class_under(rb_mJsonScanner, "Config", rb_cObject);
861
+ rb_define_alloc_func(rb_cJsonScannerConfig, config_alloc);
862
+ rb_define_method(rb_cJsonScannerConfig, "initialize", config_m_initialize, 1);
863
+ rb_define_method(rb_cJsonScannerConfig, "inspect", config_m_inspect, 0);
515
864
  rb_define_const(rb_mJsonScanner, "ANY_INDEX", rb_range_new(INT2FIX(0), INT2FIX(-1), false));
516
- rb_mJsonScannerOptions = rb_define_module_under(rb_mJsonScanner, "Options");
865
+ any_key_sym = rb_id2sym(rb_intern("*"));
866
+ rb_define_const(rb_mJsonScanner, "ANY_KEY", rb_range_new(any_key_sym, any_key_sym, false));
517
867
  rb_eJsonScannerParseError = rb_define_class_under(rb_mJsonScanner, "ParseError", rb_eRuntimeError);
518
- rb_define_const(rb_mJsonScannerOptions, "ALLOW_COMMENTS", INT2FIX(yajl_allow_comments));
519
- rb_define_const(rb_mJsonScannerOptions, "DONT_VALIDATE_STRINGS", INT2FIX(yajl_dont_validate_strings));
520
- rb_define_const(rb_mJsonScannerOptions, "ALLOW_TRAILING_GARBAGE", INT2FIX(yajl_allow_trailing_garbage));
521
- rb_define_const(rb_mJsonScannerOptions, "ALLOW_MULTIPLE_VALUES", INT2FIX(yajl_allow_multiple_values));
522
- rb_define_const(rb_mJsonScannerOptions, "ALLOW_PARTIAL_VALUES", INT2FIX(yajl_allow_partial_values));
523
- rb_define_module_function(rb_mJsonScanner, "scan", scan, 3);
868
+ rb_define_attr(rb_eJsonScannerParseError, BYTES_CONSUMED, true, false);
869
+ rb_iv_bytes_consumed = rb_intern("@" BYTES_CONSUMED);
870
+ rb_define_module_function(rb_mJsonScanner, "scan", scan, -1);
524
871
  null_sym = rb_id2sym(rb_intern("null"));
525
872
  boolean_sym = rb_id2sym(rb_intern("boolean"));
526
873
  number_sym = rb_id2sym(rb_intern("number"));
527
874
  string_sym = rb_id2sym(rb_intern("string"));
528
875
  object_sym = rb_id2sym(rb_intern("object"));
529
876
  array_sym = rb_id2sym(rb_intern("array"));
877
+ scan_kwargs_table[0] = rb_intern("with_path");
878
+ scan_kwargs_table[1] = rb_intern("verbose_error");
879
+ scan_kwargs_table[2] = rb_intern("allow_comments");
880
+ scan_kwargs_table[3] = rb_intern("dont_validate_strings");
881
+ scan_kwargs_table[4] = rb_intern("allow_trailing_garbage");
882
+ scan_kwargs_table[5] = rb_intern("allow_multiple_values");
883
+ scan_kwargs_table[6] = rb_intern("allow_partial_values");
884
+ scan_kwargs_table[7] = rb_intern("symbolize_path_keys");
530
885
  }