json_scanner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +51 -0
- data/ext/json_scanner/extconf.rb +16 -0
- data/ext/json_scanner/json_scanner.c +512 -0
- data/ext/json_scanner/json_scanner.h +12 -0
- data/lib/json_scanner/version.rb +5 -0
- data/lib/json_scanner.rb +9 -0
- data/sig/json_scanner.rbs +4 -0
- data/spec/json_scanner_spec.rb +37 -0
- data/spec/spec_helper.rb +15 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d0af7c4c2fce9ca74ec96c00e8972088b34c99a005b8fc966d1a0e9ae7d75dcb
|
4
|
+
data.tar.gz: 70f2365add4838ef7409d3ff9568ab59d1a3771ac38d6168757bf8de1de71b1d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 96958c94108fafca33f68f091dcea150e549e1fa61c02aaf62790f0d2f77c8762abe7b702a4cf95b9d4e28929a3dd1ce77681bb7cc0f6e7e8fdd22d32f74f378
|
7
|
+
data.tar.gz: 3da8a8713b1f1994d50ee3032d450b5d34070c843e7c2625db1f2945c5a6c1cdc223d9b957c0386562ea1fdf68d2d03c6fc985ca2f0ddda8a7f3b83ff2c19b36
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
[![CI](https://github.com/uvlad7/json_scanner/actions/workflows/main.yml/badge.svg)](https://github.com/uvlad7/json_scanner/actions/workflows/main.yml)
|
2
|
+
|
3
|
+
# JsonScanner
|
4
|
+
|
5
|
+
Extract values from JSON without full parsing. This gem uses the yajl library to scan a JSON string and report the byte positions of the values you ask for, so that you can parse only those pieces.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Install the gem and add to the application's Gemfile by executing:
|
10
|
+
|
11
|
+
$ bundle add json_scanner
|
12
|
+
|
13
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
14
|
+
|
15
|
+
$ gem install json_scanner
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require "json"
|
21
|
+
require "json_scanner"
|
22
|
+
|
23
|
+
large_json = "[#{"4," * 100_000}42#{",2" * 100_000}]"
|
24
|
+
where_is_42 = JsonScanner.scan(large_json, [[100_000]], false).first
|
25
|
+
# => [[200001, 200003, :number]]
|
26
|
+
where_is_42.map do |begin_pos, end_pos, _type|
|
27
|
+
JSON.parse(large_json.byteslice(begin_pos...end_pos), quirks_mode: true)
|
28
|
+
end
|
29
|
+
# => [42]
|
30
|
+
|
31
|
+
emoji_json = '{"grin": "😁", "heart": "😍", "rofl": "🤣"}'
|
32
|
+
begin_pos, end_pos, = JsonScanner.scan(emoji_json, [["heart"]], false).first.first
|
33
|
+
emoji_json.byteslice(begin_pos...end_pos)
|
34
|
+
# => "\"😍\""
|
35
|
+
emoji_json.force_encoding(Encoding::BINARY)[begin_pos...end_pos].force_encoding(Encoding::UTF_8)
|
36
|
+
# => "\"😍\""
|
37
|
+
```
|
38
|
+
|
39
|
+
## Development
|
40
|
+
|
41
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
42
|
+
|
43
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
44
|
+
|
45
|
+
## Contributing
|
46
|
+
|
47
|
+
Bug reports and pull requests are welcome on GitHub at [github](https://github.com/uvlad7/json_scanner).
|
48
|
+
|
49
|
+
## License
|
50
|
+
|
51
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true

require "mkmf"

# Makes all symbols private by default to avoid unintended conflict
# with other gems. To explicitly export symbols you can use RUBY_FUNC_EXPORTED
# selectively, or entirely remove this flag.
append_cflags("-fvisibility=hidden")

# Lets the user point the build at a custom yajl location; no default paths.
dir_config("yajl", "", "")

# Probe the library first, then both headers; `&&` stops at the first failure,
# exactly like a single chained condition would.
yajl_available = have_library("yajl") &&
                 have_header("yajl/yajl_parse.h") &&
                 have_header("yajl/yajl_gen.h")
abort "yajl library not found" unless yajl_available

create_makefile("json_scanner/json_scanner")
|
@@ -0,0 +1,512 @@
|
|
1
|
+
#include "json_scanner.h"
|
2
|
+
|
3
|
+
VALUE rb_mJsonScanner;
|
4
|
+
VALUE rb_mJsonScannerOptions;
|
5
|
+
VALUE rb_eJsonScannerParseError;
|
6
|
+
|
7
|
+
VALUE null_sym;
|
8
|
+
VALUE boolean_sym;
|
9
|
+
VALUE number_sym;
|
10
|
+
VALUE string_sym;
|
11
|
+
VALUE object_sym;
|
12
|
+
VALUE array_sym;
|
13
|
+
|
14
|
+
// Kind of matcher stored at one position of a user-supplied path.
enum matcher_type
{
    MATCHER_KEY,   // built from a Ruby String: exact object key
    MATCHER_INDEX, // built from a Ruby Integer: exact array index
    // MATCHER_ANY_KEY,
    // MATCHER_ANY_INDEX,
    MATCHER_INDEX_RANGE, // built from a Ruby Range: inclusive span of array indexes
    // MATCHER_KEYS_LIST,
};
|
23
|
+
|
24
|
+
// Kind of container the parser is currently inside at a given depth.
enum path_type
{
    PATH_KEY,   // inside an object; the element carries the current key
    PATH_INDEX, // inside an array; the element carries the current index
};
|
29
|
+
|
30
|
+
// Object key as a (pointer, length) pair. The bytes are borrowed: nothing in
// this file copies or frees them.
typedef struct
{
    const char *val; // first byte of the key; not necessarily NUL-terminated
    size_t len;      // key length in bytes
} hashkey_t;
|
35
|
+
|
36
|
+
// Span of array indexes; both bounds are compared inclusively when matching.
typedef struct
{
    long start; // first matching index
    long end;   // last matching index
} range_t;
|
41
|
+
|
42
|
+
typedef struct
|
43
|
+
{
|
44
|
+
enum matcher_type type;
|
45
|
+
union
|
46
|
+
{
|
47
|
+
hashkey_t key;
|
48
|
+
long index;
|
49
|
+
range_t range;
|
50
|
+
} value;
|
51
|
+
} path_matcher_elem_t;
|
52
|
+
|
53
|
+
typedef struct
|
54
|
+
{
|
55
|
+
enum path_type type;
|
56
|
+
union
|
57
|
+
{
|
58
|
+
hashkey_t key;
|
59
|
+
long index;
|
60
|
+
} value;
|
61
|
+
} path_elem_t;
|
62
|
+
|
63
|
+
typedef struct
|
64
|
+
{
|
65
|
+
path_matcher_elem_t *elems;
|
66
|
+
int len;
|
67
|
+
int matched_depth;
|
68
|
+
} paths_t;
|
69
|
+
|
70
|
+
typedef struct
|
71
|
+
{
|
72
|
+
int with_path;
|
73
|
+
paths_t *paths;
|
74
|
+
int paths_len;
|
75
|
+
path_elem_t *current_path;
|
76
|
+
int current_path_len;
|
77
|
+
int max_path_len;
|
78
|
+
// Easier to use a Ruby array for result than convert later
|
79
|
+
VALUE points_list;
|
80
|
+
// by depth
|
81
|
+
size_t *starts;
|
82
|
+
// VALUE rb_err;
|
83
|
+
yajl_handle handle;
|
84
|
+
} scan_ctx;
|
85
|
+
|
86
|
+
// FIXME: This will cause memory leak if ruby_xmalloc raises
|
87
|
+
scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
|
88
|
+
{
|
89
|
+
// TODO: Allow to_ary and sized enumerables
|
90
|
+
rb_check_type(path_ary, T_ARRAY);
|
91
|
+
int path_ary_len = rb_long2int(rb_array_len(path_ary));
|
92
|
+
// Check types early before any allocations, so exception is ok
|
93
|
+
// TODO: Fix this, just handle errors
|
94
|
+
for (int i = 0; i < path_ary_len; i++)
|
95
|
+
{
|
96
|
+
VALUE path = rb_ary_entry(path_ary, i);
|
97
|
+
rb_check_type(path, T_ARRAY);
|
98
|
+
int path_len = rb_long2int(rb_array_len(path));
|
99
|
+
for (int j = 0; j < path_len; j++)
|
100
|
+
{
|
101
|
+
VALUE entry = rb_ary_entry(path, j);
|
102
|
+
int type = TYPE(entry);
|
103
|
+
if (type == T_STRING)
|
104
|
+
{
|
105
|
+
#if LONG_MAX > SIZE_MAX
|
106
|
+
RSTRING_LENINT(entry);
|
107
|
+
#endif
|
108
|
+
}
|
109
|
+
else if (type == T_FIXNUM || type == T_BIGNUM)
|
110
|
+
{
|
111
|
+
RB_NUM2LONG(entry);
|
112
|
+
}
|
113
|
+
else
|
114
|
+
{
|
115
|
+
VALUE range_beg, range_end;
|
116
|
+
int open_ended;
|
117
|
+
if (rb_range_values(entry, &range_beg, &range_end, &open_ended) != Qtrue)
|
118
|
+
rb_raise(rb_eArgError, "path elements must be strings, integers, or ranges");
|
119
|
+
RB_NUM2LONG(range_beg);
|
120
|
+
RB_NUM2LONG(range_end);
|
121
|
+
}
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
scan_ctx *ctx = ruby_xmalloc(sizeof(scan_ctx));
|
126
|
+
|
127
|
+
ctx->with_path = RB_TEST(with_path);
|
128
|
+
ctx->max_path_len = 0;
|
129
|
+
|
130
|
+
paths_t *paths = ruby_xmalloc(sizeof(paths_t) * path_ary_len);
|
131
|
+
for (int i = 0; i < path_ary_len; i++)
|
132
|
+
{
|
133
|
+
VALUE path = rb_ary_entry(path_ary, i);
|
134
|
+
int path_len = rb_long2int(rb_array_len(path));
|
135
|
+
if (path_len > ctx->max_path_len)
|
136
|
+
ctx->max_path_len = path_len;
|
137
|
+
paths[i].elems = ruby_xmalloc2(sizeof(path_matcher_elem_t), path_len);
|
138
|
+
for (int j = 0; j < path_len; j++)
|
139
|
+
{
|
140
|
+
VALUE entry = rb_ary_entry(path, j);
|
141
|
+
int type = TYPE(entry);
|
142
|
+
if (type == T_STRING)
|
143
|
+
{
|
144
|
+
paths[i].elems[j].type = MATCHER_KEY;
|
145
|
+
paths[i].elems[j].value.key.val = RSTRING_PTR(entry);
|
146
|
+
#if LONG_MAX > SIZE_MAX
|
147
|
+
paths[i].elems[j].value.key.len = RSTRING_LENINT(entry);
|
148
|
+
#else
|
149
|
+
paths[i].elems[j].value.key.len = RSTRING_LEN(entry);
|
150
|
+
#endif
|
151
|
+
}
|
152
|
+
else if (type == T_FIXNUM || type == T_BIGNUM)
|
153
|
+
{
|
154
|
+
paths[i].elems[j].type = MATCHER_INDEX;
|
155
|
+
paths[i].elems[j].value.index = FIX2LONG(entry);
|
156
|
+
}
|
157
|
+
else
|
158
|
+
{
|
159
|
+
paths[i].elems[j].type = MATCHER_INDEX_RANGE;
|
160
|
+
VALUE range_beg, range_end;
|
161
|
+
int open_ended;
|
162
|
+
rb_range_values(entry, &range_beg, &range_end, &open_ended);
|
163
|
+
paths[i].elems[j].value.range.start = RB_NUM2LONG(range_beg);
|
164
|
+
paths[i].elems[j].value.range.end = RB_NUM2LONG(range_end);
|
165
|
+
if (open_ended)
|
166
|
+
paths[i].elems[j].value.range.end--;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
paths[i].len = path_len;
|
170
|
+
paths[i].matched_depth = 0;
|
171
|
+
}
|
172
|
+
|
173
|
+
ctx->paths = paths;
|
174
|
+
ctx->paths_len = path_ary_len;
|
175
|
+
ctx->current_path = ruby_xmalloc2(sizeof(path_elem_t), ctx->max_path_len);
|
176
|
+
|
177
|
+
ctx->current_path_len = 0;
|
178
|
+
ctx->points_list = rb_ary_new_capa(path_ary_len);
|
179
|
+
for (int i = 0; i < path_ary_len; i++)
|
180
|
+
{
|
181
|
+
rb_ary_push(ctx->points_list, rb_ary_new());
|
182
|
+
}
|
183
|
+
|
184
|
+
ctx->starts = ruby_xmalloc2(sizeof(size_t), ctx->max_path_len);
|
185
|
+
// ctx->rb_err = Qnil;
|
186
|
+
ctx->handle = NULL;
|
187
|
+
|
188
|
+
return ctx;
|
189
|
+
}
|
190
|
+
|
191
|
+
// Releases a context created by scan_ctx_init together with every buffer it
// owns. A NULL context is ignored. The yajl handle and the Ruby result array
// are not touched.
void scan_ctx_free(scan_ctx *ctx)
{
    if (ctx == NULL)
        return;

    ruby_xfree(ctx->starts);
    ruby_xfree(ctx->current_path);

    paths_t *path = ctx->paths;
    for (int remaining = ctx->paths_len; remaining > 0; remaining--, path++)
        ruby_xfree(path->elems);

    ruby_xfree(ctx->paths);
    ruby_xfree(ctx);
}
|
204
|
+
|
205
|
+
// noexcept
|
206
|
+
// Advances the index of the innermost container when that container is an
// array; does nothing at the root or inside an object. Called once for every
// value that starts.
//
// `static inline`: a plain C99 `inline` definition provides no external
// definition, so a call that is not inlined would fail to link.
static inline void increment_arr_index(scan_ctx *sctx)
{
    // remember - any value can be root
    // TODO: Maybe make current_path_len 1 shorter and get rid of -1; need to change all compares
    if (sctx->current_path_len && sctx->current_path[sctx->current_path_len - 1].type == PATH_INDEX)
    {
        sctx->current_path[sctx->current_path_len - 1].value.index++;
    }
}
|
215
|
+
|
216
|
+
// JSON value kinds reported in the third slot of every result point.
typedef enum
{
    null_value,    // reported as :null
    boolean_value, // reported as :boolean
    number_value,  // reported as :number
    string_value,  // reported as :string
    object_value,  // reported as :object
    array_value,   // reported as :array
} value_type;
|
225
|
+
|
226
|
+
// noexcept
|
227
|
+
// Builds the [begin_pos, end_pos, type] triple describing the value that has
// just ended and stores it in *point.
//
// point    - out parameter receiving the new Ruby Array
// sctx     - scan context; for objects and arrays begin_pos is taken from
//            sctx->starts[sctx->current_path_len]
// type     - kind of the value
// length   - byte length of a scalar token; ignored for objects and arrays
// curr_pos - byte offset just past the value
//
// Positions are converted with SIZET2NUM so they are not truncated on
// platforms where size_t is wider than unsigned long.
void create_point(VALUE *point, scan_ctx *sctx, value_type type, size_t length, size_t curr_pos)
{
    size_t begin_pos = curr_pos - length;
    VALUE type_sym = Qnil; // stays nil only for a value outside value_type

    switch (type)
    {
    case null_value:
        type_sym = null_sym;
        break;
    case boolean_value:
        type_sym = boolean_sym;
        break;
    case number_value:
        type_sym = number_sym;
        break;
    case string_value:
        type_sym = string_sym;
        break;
    case object_value:
        begin_pos = sctx->starts[sctx->current_path_len];
        type_sym = object_sym;
        break;
    case array_value:
        begin_pos = sctx->starts[sctx->current_path_len];
        type_sym = array_sym;
        break;
    }

    VALUE values[3];
    values[0] = SIZET2NUM(begin_pos);
    values[1] = SIZET2NUM(curr_pos);
    values[2] = type_sym;
    *point = rb_ary_new_from_values(3, values);
}
|
264
|
+
|
265
|
+
// noexcept
|
266
|
+
void save_point(scan_ctx *sctx, value_type type, size_t length)
|
267
|
+
{
|
268
|
+
// TODO: Abort parsing if all paths are matched and no more mathces are possible: only trivial key/index matchers at the current level
|
269
|
+
// TODO: Don't re-compare already matched prefixes; hard to invalidate, though
|
270
|
+
VALUE point = Qundef;
|
271
|
+
for (int i = 0; i < sctx->paths_len; i++)
|
272
|
+
{
|
273
|
+
if (sctx->paths[i].len != sctx->current_path_len)
|
274
|
+
continue;
|
275
|
+
|
276
|
+
int match = true;
|
277
|
+
for (int j = 0; j < sctx->current_path_len; j++)
|
278
|
+
{
|
279
|
+
switch (sctx->paths[i].elems[j].type)
|
280
|
+
{
|
281
|
+
case MATCHER_KEY:
|
282
|
+
if (sctx->current_path[j].type != PATH_KEY ||
|
283
|
+
sctx->current_path[j].value.key.len != sctx->paths[i].elems[j].value.key.len ||
|
284
|
+
strncmp(sctx->current_path[j].value.key.val, sctx->paths[i].elems[j].value.key.val, sctx->current_path[j].value.key.len))
|
285
|
+
match = false;
|
286
|
+
break;
|
287
|
+
case MATCHER_INDEX:
|
288
|
+
if (sctx->current_path[j].type != PATH_INDEX ||
|
289
|
+
sctx->current_path[j].value.index != sctx->paths[i].elems[j].value.index)
|
290
|
+
match = false;
|
291
|
+
break;
|
292
|
+
case MATCHER_INDEX_RANGE:
|
293
|
+
if (sctx->current_path[j].type != PATH_INDEX ||
|
294
|
+
sctx->current_path[j].value.index < sctx->paths[i].elems[j].value.range.start ||
|
295
|
+
sctx->current_path[j].value.index > sctx->paths[i].elems[j].value.range.end)
|
296
|
+
match = false;
|
297
|
+
break;
|
298
|
+
}
|
299
|
+
if (!match)
|
300
|
+
break;
|
301
|
+
}
|
302
|
+
if (match)
|
303
|
+
{
|
304
|
+
if (point == Qundef)
|
305
|
+
{
|
306
|
+
create_point(&point, sctx, type, length, yajl_get_bytes_consumed(sctx->handle));
|
307
|
+
}
|
308
|
+
// rb_ary_push raises only in case of a frozen array, which is not the case
|
309
|
+
// rb_ary_entry is safe
|
310
|
+
rb_ary_push(rb_ary_entry(sctx->points_list, i), point);
|
311
|
+
}
|
312
|
+
}
|
313
|
+
}
|
314
|
+
|
315
|
+
// noexcept
|
316
|
+
int scan_on_null(void *ctx)
|
317
|
+
{
|
318
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
319
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
320
|
+
return true;
|
321
|
+
increment_arr_index(sctx);
|
322
|
+
save_point(sctx, null_value, 4);
|
323
|
+
return true;
|
324
|
+
}
|
325
|
+
|
326
|
+
// noexcept
|
327
|
+
int scan_on_boolean(void *ctx, int bool_val)
|
328
|
+
{
|
329
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
330
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
331
|
+
return true;
|
332
|
+
increment_arr_index(sctx);
|
333
|
+
save_point(sctx, boolean_value, bool_val ? 4 : 5);
|
334
|
+
return true;
|
335
|
+
}
|
336
|
+
|
337
|
+
// noexcept
|
338
|
+
int scan_on_number(void *ctx, const char *val, size_t len)
|
339
|
+
{
|
340
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
341
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
342
|
+
return true;
|
343
|
+
increment_arr_index(sctx);
|
344
|
+
save_point(sctx, number_value, len);
|
345
|
+
return true;
|
346
|
+
}
|
347
|
+
|
348
|
+
// noexcept
|
349
|
+
int scan_on_string(void *ctx, const unsigned char *val, size_t len)
|
350
|
+
{
|
351
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
352
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
353
|
+
return true;
|
354
|
+
increment_arr_index(sctx);
|
355
|
+
save_point(sctx, string_value, len + 2);
|
356
|
+
return true;
|
357
|
+
}
|
358
|
+
|
359
|
+
// noexcept
|
360
|
+
int scan_on_start_object(void *ctx)
|
361
|
+
{
|
362
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
363
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
364
|
+
{
|
365
|
+
sctx->current_path_len++;
|
366
|
+
return true;
|
367
|
+
}
|
368
|
+
increment_arr_index(sctx);
|
369
|
+
if (sctx->current_path_len < sctx->max_path_len)
|
370
|
+
{
|
371
|
+
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
372
|
+
sctx->current_path[sctx->current_path_len].type = PATH_KEY;
|
373
|
+
}
|
374
|
+
sctx->current_path_len++;
|
375
|
+
return true;
|
376
|
+
}
|
377
|
+
|
378
|
+
// noexcept
|
379
|
+
int scan_on_key(void *ctx, const unsigned char *key, size_t len)
|
380
|
+
{
|
381
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
382
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
383
|
+
return true;
|
384
|
+
// Can't be called without scan_on_start_object being called before
|
385
|
+
// So current_path_len at least 1 and key.type is set to PATH_KEY;
|
386
|
+
sctx->current_path[sctx->current_path_len - 1].value.key.val = (char *) key;
|
387
|
+
sctx->current_path[sctx->current_path_len - 1].value.key.len = len;
|
388
|
+
return true;
|
389
|
+
}
|
390
|
+
|
391
|
+
// noexcept
|
392
|
+
int scan_on_end_object(void *ctx)
|
393
|
+
{
|
394
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
395
|
+
sctx->current_path_len--;
|
396
|
+
if (sctx->current_path_len >= sctx->max_path_len)
|
397
|
+
return true;
|
398
|
+
save_point(sctx, object_value, 0);
|
399
|
+
return true;
|
400
|
+
}
|
401
|
+
|
402
|
+
// noexcept
|
403
|
+
int scan_on_start_array(void *ctx)
|
404
|
+
{
|
405
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
406
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
407
|
+
{
|
408
|
+
sctx->current_path_len++;
|
409
|
+
return true;
|
410
|
+
}
|
411
|
+
increment_arr_index(sctx);
|
412
|
+
if (sctx->current_path_len < sctx->max_path_len)
|
413
|
+
{
|
414
|
+
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
415
|
+
sctx->current_path[sctx->current_path_len].type = PATH_INDEX;
|
416
|
+
sctx->current_path[sctx->current_path_len].value.index = -1;
|
417
|
+
}
|
418
|
+
sctx->current_path_len++;
|
419
|
+
return true;
|
420
|
+
}
|
421
|
+
|
422
|
+
// noexcept
|
423
|
+
int scan_on_end_array(void *ctx)
|
424
|
+
{
|
425
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
426
|
+
sctx->current_path_len--;
|
427
|
+
if (sctx->current_path_len >= sctx->max_path_len)
|
428
|
+
return true;
|
429
|
+
save_point(sctx, array_value, 0);
|
430
|
+
return true;
|
431
|
+
}
|
432
|
+
|
433
|
+
static yajl_callbacks scan_callbacks = {
|
434
|
+
scan_on_null,
|
435
|
+
scan_on_boolean,
|
436
|
+
NULL,
|
437
|
+
NULL,
|
438
|
+
scan_on_number,
|
439
|
+
scan_on_string,
|
440
|
+
scan_on_start_object,
|
441
|
+
scan_on_key,
|
442
|
+
scan_on_end_object,
|
443
|
+
scan_on_start_array,
|
444
|
+
scan_on_end_array};
|
445
|
+
|
446
|
+
// TODO: make with_path optional kw: `with_path: false`
|
447
|
+
// JsonScanner.scan(json_str, path_ary, with_path)
//
// json_str  - String holding the JSON text
// path_ary  - Array of paths, see scan_ctx_init
// with_path - stored in the context; not otherwise used yet
//
// Returns an Array with one entry per requested path, each entry being an
// Array of [begin_pos, end_pos, type] points.
// Raises TypeError when json_str is not a String, JsonScanner::ParseError
// when yajl rejects the text and NoMemoryError when the parser cannot be
// allocated.
VALUE scan(VALUE self, VALUE json_str, VALUE path_ary, VALUE with_path)
{
    rb_check_type(json_str, T_STRING);
    char *json_text = RSTRING_PTR(json_str);
#if LONG_MAX > SIZE_MAX
    size_t json_text_len = RSTRING_LENINT(json_str);
#else
    size_t json_text_len = RSTRING_LEN(json_str);
#endif
    // TODO
    int opt_verbose_error = 0;
    yajl_status status;
    scan_ctx *ctx = scan_ctx_init(path_ary, with_path);
    // The context lives in malloc'ed memory the GC does not scan. Holding the
    // result array in a local keeps it alive while the callbacks allocate.
    VALUE result = ctx->points_list;
    VALUE err = Qnil;
    // Turned out callbacks can't raise exceptions
    // VALUE callback_err;

    yajl_handle handle = yajl_alloc(&scan_callbacks, NULL, (void *)ctx);
    if (handle == NULL)
    {
        scan_ctx_free(ctx);
        rb_raise(rb_eNoMemError, "failed to allocate yajl parser");
    }
    ctx->handle = handle;
    // TODO: make it configurable
    // yajl_config(handle, yajl_allow_comments, true);
    // yajl_config(handle, yajl_allow_trailing_garbage, true);
    status = yajl_parse(handle, (unsigned char *)json_text, json_text_len);
    if (status == yajl_status_ok)
        status = yajl_complete_parse(handle);

    if (status != yajl_status_ok)
    {
        char *str = (char *)yajl_get_error(handle, opt_verbose_error, (unsigned char *)json_text, json_text_len);
        err = rb_str_new_cstr(str);
        yajl_free_error(handle, (unsigned char *)str);
    }
    // callback_err = ctx->rb_err;
    scan_ctx_free(ctx);
    yajl_free(handle);

    // Raw pointers into these objects were in use up to this point.
    RB_GC_GUARD(json_str);
    RB_GC_GUARD(path_ary);
    RB_GC_GUARD(result);

    if (err != Qnil)
        rb_exc_raise(rb_exc_new_str(rb_eJsonScannerParseError, err));
    // if (callback_err != Qnil)
    //   rb_exc_raise(callback_err);
    // TODO: report yajl_get_bytes_consumed(handle)
    return result;
}
|
492
|
+
|
493
|
+
RUBY_FUNC_EXPORTED void
|
494
|
+
Init_json_scanner(void)
|
495
|
+
{
|
496
|
+
rb_mJsonScanner = rb_define_module("JsonScanner");
|
497
|
+
rb_define_const(rb_mJsonScanner, "ALL", rb_range_new(INT2FIX(0), INT2FIX(-1), false));
|
498
|
+
rb_mJsonScannerOptions = rb_define_module_under(rb_mJsonScanner, "Options");
|
499
|
+
rb_eJsonScannerParseError = rb_define_class_under(rb_mJsonScanner, "ParseError", rb_eRuntimeError);
|
500
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_COMMENTS", INT2FIX(yajl_allow_comments));
|
501
|
+
rb_define_const(rb_mJsonScannerOptions, "DONT_VALIDATE_STRINGS", INT2FIX(yajl_dont_validate_strings));
|
502
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_TRAILING_GARBAGE", INT2FIX(yajl_allow_trailing_garbage));
|
503
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_MULTIPLE_VALUES", INT2FIX(yajl_allow_multiple_values));
|
504
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_PARTIAL_VALUES", INT2FIX(yajl_allow_partial_values));
|
505
|
+
rb_define_module_function(rb_mJsonScanner, "scan", scan, 3);
|
506
|
+
null_sym = rb_id2sym(rb_intern("null"));
|
507
|
+
boolean_sym = rb_id2sym(rb_intern("boolean"));
|
508
|
+
number_sym = rb_id2sym(rb_intern("number"));
|
509
|
+
string_sym = rb_id2sym(rb_intern("string"));
|
510
|
+
object_sym = rb_id2sym(rb_intern("object"));
|
511
|
+
array_sym = rb_id2sym(rb_intern("array"));
|
512
|
+
}
|
data/lib/json_scanner.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true

require_relative "spec_helper"

RSpec.describe JsonScanner do
  it "has a version number" do
    expect(described_class::VERSION).not_to be nil
  end

  # One example per behaviour, so a failing expectation does not hide the rest.
  describe ".scan" do
    it "finds values by index, by nested key and at the root of an array" do
      result = described_class.scan('["1", {"a": 2}]', [[0], [1, "a"], []], false)
      expect(result).to eq([[[1, 4, :string]], [[12, 13, :number]], [[0, 15, :array]]])
    end

    it "accepts a scalar as the root value" do
      expect(described_class.scan('"2"', [[]], false)).to eq([[[0, 3, :string]]])
    end

    it "matches inclusive and exclusive index ranges" do
      expect(
        described_class.scan("[0,1,2,3,4,5,6,7]", [[(0..2)], [(4...6)]], false)
      ).to eq(
        [[[1, 2, :number], [3, 4, :number], [5, 6, :number]], [[9, 10, :number], [11, 12, :number]]]
      )
    end

    it "finds values by key and at the root of an object" do
      expect(described_class.scan('{"a": 1}', [["a"], []], false)).to eq(
        [[[6, 7, :number]], [[0, 8, :object]]]
      )
    end

    it "raises ParseError on unbalanced input under GC stress" do
      expect do
        # Explicit begin/ensure: the gem supports Rubies older than 2.6, which
        # do not allow `ensure` directly inside a do/end block.
        begin
          GC.stress = true
          # TODO: investigate
          # got "munmap_chunk(): invalid pointer" in the console once after
          # JsonScanner.scan '[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]', [[0,0,0,0,0,0,0]], true + Ctrl+D
          # (last arg wasn't handled at the time)
          # but I don't think it's a problem of this extension or libyajl, it happened at exit and I free everything before
          # `JsonScanner.scan` returns
          described_class.scan "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]", [[0, 0, 0, 0, 0, 0, 0]], false
        ensure
          GC.stress = false
        end
      end.to raise_error described_class::ParseError
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true

require "json_scanner"

RSpec.configure do |rspec|
  # Enable flags like --only-failures and --next-failure
  rspec.example_status_persistence_file_path = ".rspec_status"

  # Disable RSpec exposing methods globally on `Module` and `main`
  rspec.disable_monkey_patching!

  # Only the `expect` syntax is enabled
  rspec.expect_with(:rspec) { |expectations| expectations.syntax = :expect }
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: json_scanner
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- uvlad7
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-12-15 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: This gem uses yajl lib to scan a json string and allows you to parse
|
14
|
+
pieces of it
|
15
|
+
email:
|
16
|
+
- uvlad7@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/json_scanner/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- README.md
|
23
|
+
- ext/json_scanner/extconf.rb
|
24
|
+
- ext/json_scanner/json_scanner.c
|
25
|
+
- ext/json_scanner/json_scanner.h
|
26
|
+
- lib/json_scanner.rb
|
27
|
+
- lib/json_scanner/version.rb
|
28
|
+
- sig/json_scanner.rbs
|
29
|
+
- spec/json_scanner_spec.rb
|
30
|
+
- spec/spec_helper.rb
|
31
|
+
homepage: https://github.com/uvlad7/json_scanner
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata:
|
35
|
+
homepage_uri: https://github.com/uvlad7/json_scanner
|
36
|
+
source_code_uri: https://github.com/uvlad7/json_scanner
|
37
|
+
changelog_uri: https://github.com/uvlad7/json_scanner/CHANGELOG.md
|
38
|
+
rubygems_mfa_required: 'true'
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.3.8
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements:
|
54
|
+
- libyajl2, v2.1
|
55
|
+
- libyajl-dev, v2.1
|
56
|
+
rubygems_version: 3.5.7
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Extract values from JSON without full parsing
|
60
|
+
test_files: []
|