json_scanner 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +51 -0
- data/ext/json_scanner/extconf.rb +16 -0
- data/ext/json_scanner/json_scanner.c +512 -0
- data/ext/json_scanner/json_scanner.h +12 -0
- data/lib/json_scanner/version.rb +5 -0
- data/lib/json_scanner.rb +9 -0
- data/sig/json_scanner.rbs +4 -0
- data/spec/json_scanner_spec.rb +37 -0
- data/spec/spec_helper.rb +15 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: d0af7c4c2fce9ca74ec96c00e8972088b34c99a005b8fc966d1a0e9ae7d75dcb
|
4
|
+
data.tar.gz: 70f2365add4838ef7409d3ff9568ab59d1a3771ac38d6168757bf8de1de71b1d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 96958c94108fafca33f68f091dcea150e549e1fa61c02aaf62790f0d2f77c8762abe7b702a4cf95b9d4e28929a3dd1ce77681bb7cc0f6e7e8fdd22d32f74f378
|
7
|
+
data.tar.gz: 3da8a8713b1f1994d50ee3032d450b5d34070c843e7c2625db1f2945c5a6c1cdc223d9b957c0386562ea1fdf68d2d03c6fc985ca2f0ddda8a7f3b83ff2c19b36
|
data/README.md
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
[![Tests](https://github.com/uvlad7/json_scanner/actions/workflows/main.yml/badge.svg)](https://github.com/uvlad7/json_scanner/actions/workflows/main.yml)
|
2
|
+
|
3
|
+
# JsonScanner
|
4
|
+
|
5
|
+
Extract values from JSON without fully parsing it. This gem uses the yajl library to scan a JSON string and report the byte positions of the values you ask for, so you can parse only those pieces.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Install the gem and add to the application's Gemfile by executing:
|
10
|
+
|
11
|
+
$ bundle add json_scanner
|
12
|
+
|
13
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
14
|
+
|
15
|
+
$ gem install json_scanner
|
16
|
+
|
17
|
+
## Usage
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require "json"
|
21
|
+
require "json_scanner"
|
22
|
+
|
23
|
+
large_json = "[#{"4," * 100_000}42#{",2" * 100_000}]"
|
24
|
+
where_is_42 = JsonScanner.scan(large_json, [[100_000]], false).first
|
25
|
+
# => [[200001, 200003, :number]]
|
26
|
+
where_is_42.map do |begin_pos, end_pos, _type|
|
27
|
+
JSON.parse(large_json.byteslice(begin_pos...end_pos), quirks_mode: true)
|
28
|
+
end
|
29
|
+
# => [42]
|
30
|
+
|
31
|
+
emoji_json = '{"grin": "😁", "heart": "😍", "rofl": "🤣"}'
|
32
|
+
begin_pos, end_pos, = JsonScanner.scan(emoji_json, [["heart"]], false).first.first
|
33
|
+
emoji_json.byteslice(begin_pos...end_pos)
|
34
|
+
# => "\"😍\""
|
35
|
+
emoji_json.force_encoding(Encoding::BINARY)[begin_pos...end_pos].force_encoding(Encoding::UTF_8)
|
36
|
+
# => "\"😍\""
|
37
|
+
```
|
38
|
+
|
39
|
+
## Development
|
40
|
+
|
41
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
42
|
+
|
43
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
44
|
+
|
45
|
+
## Contributing
|
46
|
+
|
47
|
+
Bug reports and pull requests are welcome on GitHub at [github](https://github.com/uvlad7/json_scanner).
|
48
|
+
|
49
|
+
## License
|
50
|
+
|
51
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true

require "mkmf"

# Hide every symbol by default so the extension cannot clash with other gems.
# Individual symbols can still be exported with RUBY_FUNC_EXPORTED; drop this
# flag to export everything.
append_cflags("-fvisibility=hidden")

dir_config("yajl", "", "")

# The build needs the yajl library itself plus both of its public headers.
yajl_available = have_library("yajl") &&
                 have_header("yajl/yajl_parse.h") &&
                 have_header("yajl/yajl_gen.h")
abort "yajl library not found" unless yajl_available

create_makefile("json_scanner/json_scanner")
|
@@ -0,0 +1,512 @@
|
|
1
|
+
#include "json_scanner.h"
|
2
|
+
|
3
|
+
/* The JsonScanner module, its Options namespace and its ParseError class;
 * all three are assigned in Init_json_scanner. */
VALUE rb_mJsonScanner, rb_mJsonScannerOptions, rb_eJsonScannerParseError;

/* Symbols stored as the third element of every reported point. */
VALUE null_sym, boolean_sym, number_sym;
VALUE string_sym, object_sym, array_sym;
|
13
|
+
|
14
|
+
/* Kind of a single element inside a user-supplied path pattern. */
enum matcher_type {
  MATCHER_KEY,   /* object key */
  MATCHER_INDEX, /* single array index */
  /* MATCHER_ANY_KEY, */
  /* MATCHER_ANY_INDEX, */
  MATCHER_INDEX_RANGE, /* range of array indices, both bounds inclusive */
  /* MATCHER_KEYS_LIST, */
};
|
23
|
+
|
24
|
+
/* Kind of a single element in the path of the value currently being parsed. */
enum path_type {
  PATH_KEY,   /* inside an object */
  PATH_INDEX, /* inside an array */
};
|
29
|
+
|
30
|
+
/* An object key as a (pointer, byte length) pair; the bytes are not owned. */
typedef struct {
  const char *val;
  size_t len;
} hashkey_t;
|
35
|
+
|
36
|
+
/* Array index range; save_point treats both bounds as inclusive. */
typedef struct {
  long start;
  long end;
} range_t;
|
41
|
+
|
42
|
+
typedef struct
|
43
|
+
{
|
44
|
+
enum matcher_type type;
|
45
|
+
union
|
46
|
+
{
|
47
|
+
hashkey_t key;
|
48
|
+
long index;
|
49
|
+
range_t range;
|
50
|
+
} value;
|
51
|
+
} path_matcher_elem_t;
|
52
|
+
|
53
|
+
typedef struct
|
54
|
+
{
|
55
|
+
enum path_type type;
|
56
|
+
union
|
57
|
+
{
|
58
|
+
hashkey_t key;
|
59
|
+
long index;
|
60
|
+
} value;
|
61
|
+
} path_elem_t;
|
62
|
+
|
63
|
+
typedef struct
|
64
|
+
{
|
65
|
+
path_matcher_elem_t *elems;
|
66
|
+
int len;
|
67
|
+
int matched_depth;
|
68
|
+
} paths_t;
|
69
|
+
|
70
|
+
typedef struct
|
71
|
+
{
|
72
|
+
int with_path;
|
73
|
+
paths_t *paths;
|
74
|
+
int paths_len;
|
75
|
+
path_elem_t *current_path;
|
76
|
+
int current_path_len;
|
77
|
+
int max_path_len;
|
78
|
+
// Easier to use a Ruby array for result than convert later
|
79
|
+
VALUE points_list;
|
80
|
+
// by depth
|
81
|
+
size_t *starts;
|
82
|
+
// VALUE rb_err;
|
83
|
+
yajl_handle handle;
|
84
|
+
} scan_ctx;
|
85
|
+
|
86
|
+
// FIXME: This will cause memory leak if ruby_xmalloc raises
|
87
|
+
/*
 * Builds the scanning context from the Ruby-side arguments.
 *
 * path_ary  - Array of paths; each path is an Array of Strings (object keys),
 *             Integers (array indices) or Ranges of Integers (index ranges).
 * with_path - stored as a C boolean in ctx->with_path.
 *
 * Returns a context that must be released with scan_ctx_free(). The key
 * matchers point directly into the Ruby Strings of path_ary, and
 * ctx->points_list lives in memory the GC does not scan, so the caller must
 * keep both path_ary and ctx->points_list reachable while the context is used.
 *
 * Raises TypeError/ArgumentError/RangeError for malformed paths; all of these
 * checks happen before anything is allocated.
 */
scan_ctx *scan_ctx_init(VALUE path_ary, VALUE with_path)
{
  // TODO: Allow to_ary and sized enumerables
  rb_check_type(path_ary, T_ARRAY);
  int path_ary_len = rb_long2int(rb_array_len(path_ary));
  // Validation pass: every conversion that can raise is tried here, before
  // any allocation, so an exception cannot leak memory.
  // TODO: Fix this, just handle errors
  for (int i = 0; i < path_ary_len; i++)
  {
    VALUE path = rb_ary_entry(path_ary, i);
    rb_check_type(path, T_ARRAY);
    int path_len = rb_long2int(rb_array_len(path));
    for (int j = 0; j < path_len; j++)
    {
      VALUE entry = rb_ary_entry(path, j);
      int type = TYPE(entry);
      if (type == T_STRING)
      {
#if LONG_MAX > SIZE_MAX
        RSTRING_LENINT(entry);
#endif
      }
      else if (type == T_FIXNUM || type == T_BIGNUM)
      {
        RB_NUM2LONG(entry);
      }
      else
      {
        VALUE range_beg, range_end;
        int open_ended;
        if (rb_range_values(entry, &range_beg, &range_end, &open_ended) != Qtrue)
          rb_raise(rb_eArgError, "path elements must be strings, integers, or ranges");
        RB_NUM2LONG(range_beg);
        RB_NUM2LONG(range_end);
      }
    }
  }

  scan_ctx *ctx = ruby_xmalloc(sizeof(scan_ctx));

  ctx->with_path = RB_TEST(with_path);
  ctx->max_path_len = 0;

  // xmalloc2 checks the multiplication for overflow, like the other allocations here
  paths_t *paths = ruby_xmalloc2(path_ary_len, sizeof(paths_t));
  for (int i = 0; i < path_ary_len; i++)
  {
    VALUE path = rb_ary_entry(path_ary, i);
    int path_len = rb_long2int(rb_array_len(path));
    if (path_len > ctx->max_path_len)
      ctx->max_path_len = path_len;
    paths[i].elems = ruby_xmalloc2(path_len, sizeof(path_matcher_elem_t));
    for (int j = 0; j < path_len; j++)
    {
      VALUE entry = rb_ary_entry(path, j);
      path_matcher_elem_t *elem = &paths[i].elems[j];
      int type = TYPE(entry);
      if (type == T_STRING)
      {
        elem->type = MATCHER_KEY;
        // Not copied: points into the Ruby String owned by path_ary
        elem->value.key.val = RSTRING_PTR(entry);
#if LONG_MAX > SIZE_MAX
        elem->value.key.len = RSTRING_LENINT(entry);
#else
        elem->value.key.len = RSTRING_LEN(entry);
#endif
      }
      else if (type == T_FIXNUM || type == T_BIGNUM)
      {
        elem->type = MATCHER_INDEX;
        // FIX2LONG is only valid for Fixnums; a Bignum that passed the
        // validation above still fits in a long and needs NUM2LONG.
        elem->value.index = RB_NUM2LONG(entry);
      }
      else
      {
        elem->type = MATCHER_INDEX_RANGE;
        VALUE range_beg, range_end;
        int open_ended;
        rb_range_values(entry, &range_beg, &range_end, &open_ended);
        long range_last = RB_NUM2LONG(range_end);
        if (open_ended)
          range_last--; // exclusive end -> inclusive end
        else if (range_last == -1)
          range_last = LONG_MAX; // (n..-1), e.g. JsonScanner::ALL, means "up to the last element"
        elem->value.range.start = RB_NUM2LONG(range_beg);
        elem->value.range.end = range_last;
      }
    }
    paths[i].len = path_len;
    paths[i].matched_depth = 0;
  }

  ctx->paths = paths;
  ctx->paths_len = path_ary_len;
  ctx->current_path = ruby_xmalloc2(ctx->max_path_len, sizeof(path_elem_t));

  ctx->current_path_len = 0;
  // Held in a local as well: ctx is malloc'ed memory the GC does not scan,
  // and the rb_ary_new calls below may trigger a collection.
  VALUE points_list = rb_ary_new_capa(path_ary_len);
  ctx->points_list = points_list;
  for (int i = 0; i < path_ary_len; i++)
  {
    rb_ary_push(points_list, rb_ary_new());
  }

  ctx->starts = ruby_xmalloc2(ctx->max_path_len, sizeof(size_t));
  // ctx->rb_err = Qnil;
  ctx->handle = NULL;

  RB_GC_GUARD(points_list);
  return ctx;
}
|
190
|
+
|
191
|
+
/*
 * Releases a context created by scan_ctx_init, including every per-path
 * element array. Accepts NULL. The Ruby array in points_list and the yajl
 * handle are not touched here.
 */
void scan_ctx_free(scan_ctx *ctx)
{
  if (ctx == NULL)
    return;

  ruby_xfree(ctx->starts);
  ruby_xfree(ctx->current_path);

  paths_t *path = ctx->paths;
  for (int remaining = ctx->paths_len; remaining > 0; remaining--, path++)
    ruby_xfree(path->elems);

  ruby_xfree(ctx->paths);
  ruby_xfree(ctx);
}
|
204
|
+
|
205
|
+
// noexcept
|
206
|
+
/*
 * Advances the index of the innermost open container when that container is
 * an array; does nothing at the root (any value can be the root) or inside
 * an object.
 *
 * `static inline` rather than plain `inline`: under C99/C11 rules a plain
 * inline definition emits no external symbol, so any call the compiler
 * decides not to inline (e.g. at -O0) would be left unresolved.
 */
static inline void increment_arr_index(scan_ctx *sctx)
{
  // TODO: Maybe make current_path_len 1 shorter and get rid of -1; need to change all compares
  if (sctx->current_path_len == 0)
    return;

  path_elem_t *innermost = &sctx->current_path[sctx->current_path_len - 1];
  if (innermost->type == PATH_INDEX)
    innermost->value.index++;
}
|
215
|
+
|
216
|
+
/* JSON value kinds reported by the callbacks; create_point maps each one to
 * the matching *_sym symbol. */
typedef enum {
  null_value,
  boolean_value,
  number_value,
  string_value,
  object_value,
  array_value,
} value_type;
|
225
|
+
|
226
|
+
// noexcept
|
227
|
+
/*
 * Stores a new Ruby array [begin_pos, end_pos, type_symbol] in *point.
 *
 * point    - out parameter receiving the new array.
 * sctx     - scan context; for objects and arrays the begin position is read
 *            from sctx->starts at the current depth.
 * type     - kind of the value that has just been parsed.
 * length   - byte length of a scalar value (ignored for objects and arrays).
 * curr_pos - offset just past the value; becomes end_pos.
 *
 * Positions are converted with SIZET2NUM so they are not truncated on
 * platforms where size_t is wider than unsigned long.
 */
void create_point(VALUE *point, scan_ctx *sctx, value_type type, size_t length, size_t curr_pos)
{
  // Scalars start `length` bytes before the current position; containers
  // override this below with the offset remembered when they were opened.
  size_t start_pos = curr_pos - length;
  VALUE type_sym = Qnil;

  switch (type)
  {
  case null_value:
    type_sym = null_sym;
    break;
  case boolean_value:
    type_sym = boolean_sym;
    break;
  case number_value:
    type_sym = number_sym;
    break;
  case string_value:
    type_sym = string_sym;
    break;
  case object_value:
    type_sym = object_sym;
    start_pos = sctx->starts[sctx->current_path_len];
    break;
  case array_value:
    type_sym = array_sym;
    start_pos = sctx->starts[sctx->current_path_len];
    break;
  }

  VALUE values[3] = {SIZET2NUM(start_pos), SIZET2NUM(curr_pos), type_sym};
  *point = rb_ary_new_capa(3);
  // rb_ary_cat raises only for a frozen array or an oversized length
  rb_ary_cat(*point, values, 3);
}
|
264
|
+
|
265
|
+
// noexcept
|
266
|
+
void save_point(scan_ctx *sctx, value_type type, size_t length)
|
267
|
+
{
|
268
|
+
// TODO: Abort parsing if all paths are matched and no more mathces are possible: only trivial key/index matchers at the current level
|
269
|
+
// TODO: Don't re-compare already matched prefixes; hard to invalidate, though
|
270
|
+
VALUE point = Qundef;
|
271
|
+
for (int i = 0; i < sctx->paths_len; i++)
|
272
|
+
{
|
273
|
+
if (sctx->paths[i].len != sctx->current_path_len)
|
274
|
+
continue;
|
275
|
+
|
276
|
+
int match = true;
|
277
|
+
for (int j = 0; j < sctx->current_path_len; j++)
|
278
|
+
{
|
279
|
+
switch (sctx->paths[i].elems[j].type)
|
280
|
+
{
|
281
|
+
case MATCHER_KEY:
|
282
|
+
if (sctx->current_path[j].type != PATH_KEY ||
|
283
|
+
sctx->current_path[j].value.key.len != sctx->paths[i].elems[j].value.key.len ||
|
284
|
+
strncmp(sctx->current_path[j].value.key.val, sctx->paths[i].elems[j].value.key.val, sctx->current_path[j].value.key.len))
|
285
|
+
match = false;
|
286
|
+
break;
|
287
|
+
case MATCHER_INDEX:
|
288
|
+
if (sctx->current_path[j].type != PATH_INDEX ||
|
289
|
+
sctx->current_path[j].value.index != sctx->paths[i].elems[j].value.index)
|
290
|
+
match = false;
|
291
|
+
break;
|
292
|
+
case MATCHER_INDEX_RANGE:
|
293
|
+
if (sctx->current_path[j].type != PATH_INDEX ||
|
294
|
+
sctx->current_path[j].value.index < sctx->paths[i].elems[j].value.range.start ||
|
295
|
+
sctx->current_path[j].value.index > sctx->paths[i].elems[j].value.range.end)
|
296
|
+
match = false;
|
297
|
+
break;
|
298
|
+
}
|
299
|
+
if (!match)
|
300
|
+
break;
|
301
|
+
}
|
302
|
+
if (match)
|
303
|
+
{
|
304
|
+
if (point == Qundef)
|
305
|
+
{
|
306
|
+
create_point(&point, sctx, type, length, yajl_get_bytes_consumed(sctx->handle));
|
307
|
+
}
|
308
|
+
// rb_ary_push raises only in case of a frozen array, which is not the case
|
309
|
+
// rb_ary_entry is safe
|
310
|
+
rb_ary_push(rb_ary_entry(sctx->points_list, i), point);
|
311
|
+
}
|
312
|
+
}
|
313
|
+
}
|
314
|
+
|
315
|
+
// noexcept
|
316
|
+
int scan_on_null(void *ctx)
|
317
|
+
{
|
318
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
319
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
320
|
+
return true;
|
321
|
+
increment_arr_index(sctx);
|
322
|
+
save_point(sctx, null_value, 4);
|
323
|
+
return true;
|
324
|
+
}
|
325
|
+
|
326
|
+
// noexcept
|
327
|
+
int scan_on_boolean(void *ctx, int bool_val)
|
328
|
+
{
|
329
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
330
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
331
|
+
return true;
|
332
|
+
increment_arr_index(sctx);
|
333
|
+
save_point(sctx, boolean_value, bool_val ? 4 : 5);
|
334
|
+
return true;
|
335
|
+
}
|
336
|
+
|
337
|
+
// noexcept
|
338
|
+
int scan_on_number(void *ctx, const char *val, size_t len)
|
339
|
+
{
|
340
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
341
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
342
|
+
return true;
|
343
|
+
increment_arr_index(sctx);
|
344
|
+
save_point(sctx, number_value, len);
|
345
|
+
return true;
|
346
|
+
}
|
347
|
+
|
348
|
+
// noexcept
|
349
|
+
int scan_on_string(void *ctx, const unsigned char *val, size_t len)
|
350
|
+
{
|
351
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
352
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
353
|
+
return true;
|
354
|
+
increment_arr_index(sctx);
|
355
|
+
save_point(sctx, string_value, len + 2);
|
356
|
+
return true;
|
357
|
+
}
|
358
|
+
|
359
|
+
// noexcept
|
360
|
+
int scan_on_start_object(void *ctx)
|
361
|
+
{
|
362
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
363
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
364
|
+
{
|
365
|
+
sctx->current_path_len++;
|
366
|
+
return true;
|
367
|
+
}
|
368
|
+
increment_arr_index(sctx);
|
369
|
+
if (sctx->current_path_len < sctx->max_path_len)
|
370
|
+
{
|
371
|
+
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
372
|
+
sctx->current_path[sctx->current_path_len].type = PATH_KEY;
|
373
|
+
}
|
374
|
+
sctx->current_path_len++;
|
375
|
+
return true;
|
376
|
+
}
|
377
|
+
|
378
|
+
// noexcept
|
379
|
+
int scan_on_key(void *ctx, const unsigned char *key, size_t len)
|
380
|
+
{
|
381
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
382
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
383
|
+
return true;
|
384
|
+
// Can't be called without scan_on_start_object being called before
|
385
|
+
// So current_path_len at least 1 and key.type is set to PATH_KEY;
|
386
|
+
sctx->current_path[sctx->current_path_len - 1].value.key.val = (char *) key;
|
387
|
+
sctx->current_path[sctx->current_path_len - 1].value.key.len = len;
|
388
|
+
return true;
|
389
|
+
}
|
390
|
+
|
391
|
+
// noexcept
|
392
|
+
int scan_on_end_object(void *ctx)
|
393
|
+
{
|
394
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
395
|
+
sctx->current_path_len--;
|
396
|
+
if (sctx->current_path_len >= sctx->max_path_len)
|
397
|
+
return true;
|
398
|
+
save_point(sctx, object_value, 0);
|
399
|
+
return true;
|
400
|
+
}
|
401
|
+
|
402
|
+
// noexcept
|
403
|
+
int scan_on_start_array(void *ctx)
|
404
|
+
{
|
405
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
406
|
+
if (sctx->current_path_len > sctx->max_path_len)
|
407
|
+
{
|
408
|
+
sctx->current_path_len++;
|
409
|
+
return true;
|
410
|
+
}
|
411
|
+
increment_arr_index(sctx);
|
412
|
+
if (sctx->current_path_len < sctx->max_path_len)
|
413
|
+
{
|
414
|
+
sctx->starts[sctx->current_path_len] = yajl_get_bytes_consumed(sctx->handle) - 1;
|
415
|
+
sctx->current_path[sctx->current_path_len].type = PATH_INDEX;
|
416
|
+
sctx->current_path[sctx->current_path_len].value.index = -1;
|
417
|
+
}
|
418
|
+
sctx->current_path_len++;
|
419
|
+
return true;
|
420
|
+
}
|
421
|
+
|
422
|
+
// noexcept
|
423
|
+
int scan_on_end_array(void *ctx)
|
424
|
+
{
|
425
|
+
scan_ctx *sctx = (scan_ctx *)ctx;
|
426
|
+
sctx->current_path_len--;
|
427
|
+
if (sctx->current_path_len >= sctx->max_path_len)
|
428
|
+
return true;
|
429
|
+
save_point(sctx, array_value, 0);
|
430
|
+
return true;
|
431
|
+
}
|
432
|
+
|
433
|
+
static yajl_callbacks scan_callbacks = {
|
434
|
+
scan_on_null,
|
435
|
+
scan_on_boolean,
|
436
|
+
NULL,
|
437
|
+
NULL,
|
438
|
+
scan_on_number,
|
439
|
+
scan_on_string,
|
440
|
+
scan_on_start_object,
|
441
|
+
scan_on_key,
|
442
|
+
scan_on_end_object,
|
443
|
+
scan_on_start_array,
|
444
|
+
scan_on_end_array};
|
445
|
+
|
446
|
+
// TODO: make with_path optional kw: `with_path: false`
|
447
|
+
/*
 * JsonScanner.scan(json_str, path_ary, with_path)
 *
 * Scans json_str with yajl and returns one array per path in path_ary; each
 * inner array holds the [begin_pos, end_pos, type] points of the values that
 * matched that path.
 *
 * json_str  - String with the JSON text.
 * path_ary  - Array of paths, validated by scan_ctx_init.
 * with_path - passed through to scan_ctx_init.
 *
 * Raises TypeError when json_str is not a String, whatever scan_ctx_init
 * raises for malformed paths, NoMemoryError when the yajl parser cannot be
 * allocated, and JsonScanner::ParseError when yajl rejects the document.
 */
VALUE scan(VALUE self, VALUE json_str, VALUE path_ary, VALUE with_path)
{
  rb_check_type(json_str, T_STRING);
  // TODO
  int opt_verbose_error = 0;
  yajl_status stat;
  VALUE err = Qnil;
  // Turned out callbacks can't raise exceptions
  // VALUE callback_err;

  scan_ctx *ctx = scan_ctx_init(path_ary, with_path);
  // ctx is malloc'ed memory the GC does not scan, and the callbacks allocate
  // Ruby objects while parsing; this local (guarded below) is what keeps the
  // result array reachable.
  VALUE result = ctx->points_list;

  char *json_text = RSTRING_PTR(json_str);
#if LONG_MAX > SIZE_MAX
  size_t json_text_len = RSTRING_LENINT(json_str);
#else
  size_t json_text_len = RSTRING_LEN(json_str);
#endif

  yajl_handle handle = yajl_alloc(&scan_callbacks, NULL, (void *)ctx);
  if (handle == NULL)
  {
    // Previously an allocation failure here led to a NULL handle being used
    scan_ctx_free(ctx);
    rb_memerror();
  }
  ctx->handle = handle;
  // TODO: make it configurable
  // yajl_config(handle, yajl_allow_comments, true);
  // yajl_config(handle, yajl_allow_trailing_garbage, true);
  stat = yajl_parse(handle, (unsigned char *)json_text, json_text_len);
  if (stat == yajl_status_ok)
    stat = yajl_complete_parse(handle);

  if (stat != yajl_status_ok)
  {
    unsigned char *msg = yajl_get_error(handle, opt_verbose_error, (unsigned char *)json_text, json_text_len);
    err = rb_str_new_cstr((char *)msg);
    yajl_free_error(handle, msg);
  }
  // callback_err = ctx->rb_err;
  scan_ctx_free(ctx);
  yajl_free(handle);

  // json_text and the key matchers pointed into these objects until now
  RB_GC_GUARD(json_str);
  RB_GC_GUARD(path_ary);
  RB_GC_GUARD(result);

  if (err != Qnil)
    rb_exc_raise(rb_exc_new_str(rb_eJsonScannerParseError, err));
  // if (callback_err != Qnil)
  //   rb_exc_raise(callback_err);
  // TODO: report yajl_get_bytes_consumed(handle)
  return result;
}
|
492
|
+
|
493
|
+
RUBY_FUNC_EXPORTED void
|
494
|
+
Init_json_scanner(void)
|
495
|
+
{
|
496
|
+
rb_mJsonScanner = rb_define_module("JsonScanner");
|
497
|
+
rb_define_const(rb_mJsonScanner, "ALL", rb_range_new(INT2FIX(0), INT2FIX(-1), false));
|
498
|
+
rb_mJsonScannerOptions = rb_define_module_under(rb_mJsonScanner, "Options");
|
499
|
+
rb_eJsonScannerParseError = rb_define_class_under(rb_mJsonScanner, "ParseError", rb_eRuntimeError);
|
500
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_COMMENTS", INT2FIX(yajl_allow_comments));
|
501
|
+
rb_define_const(rb_mJsonScannerOptions, "DONT_VALIDATE_STRINGS", INT2FIX(yajl_dont_validate_strings));
|
502
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_TRAILING_GARBAGE", INT2FIX(yajl_allow_trailing_garbage));
|
503
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_MULTIPLE_VALUES", INT2FIX(yajl_allow_multiple_values));
|
504
|
+
rb_define_const(rb_mJsonScannerOptions, "ALLOW_PARTIAL_VALUES", INT2FIX(yajl_allow_partial_values));
|
505
|
+
rb_define_module_function(rb_mJsonScanner, "scan", scan, 3);
|
506
|
+
null_sym = rb_id2sym(rb_intern("null"));
|
507
|
+
boolean_sym = rb_id2sym(rb_intern("boolean"));
|
508
|
+
number_sym = rb_id2sym(rb_intern("number"));
|
509
|
+
string_sym = rb_id2sym(rb_intern("string"));
|
510
|
+
object_sym = rb_id2sym(rb_intern("object"));
|
511
|
+
array_sym = rb_id2sym(rb_intern("array"));
|
512
|
+
}
|
data/lib/json_scanner.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "spec_helper"
|
4
|
+
|
5
|
+
RSpec.describe JsonScanner do
  it "has a version number" do
    expect(described_class::VERSION).not_to be nil
  end

  it "scans json" do
    # Index, nested key and root ([]) paths in a single call
    mixed_points = described_class.scan('["1", {"a": 2}]', [[0], [1, "a"], []], false)
    expect(mixed_points).to eq([[[1, 4, :string]], [[12, 13, :number]], [[0, 15, :array]]])

    # A scalar can be the root value
    root_scalar_points = described_class.scan('"2"', [[]], false)
    expect(root_scalar_points).to eq([[[0, 3, :string]]])

    # Inclusive and exclusive index ranges
    range_points = described_class.scan("[0,1,2,3,4,5,6,7]", [[(0..2)], [(4...6)]], false)
    expect(range_points).to eq(
      [[[1, 2, :number], [3, 4, :number], [5, 6, :number]], [[9, 10, :number], [11, 12, :number]]]
    )

    # Key path plus the root object
    object_points = described_class.scan('{"a": 1}', [["a"], []], false)
    expect(object_points).to eq([[[6, 7, :number]], [[0, 8, :object]]])

    # Unbalanced brackets must raise ParseError, even under GC stress.
    #
    # TODO: investigate
    # got "munmap_chunk(): invalid pointer" in console once after
    # JsonScanner.scan '[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]', [[0,0,0,0,0,0,0]], true + Ctrl+D
    # (last arg wasn't handled at the time)
    # but I don't think it's a problem of the extension or libyajl, it happened at exit and I free everything before
    # `JsonScanner.scan` returns
    unbalanced_json = "[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[[]]]]]]]]]]]]]]]]]]]]]]"
    expect do
      begin
        GC.stress = true
        described_class.scan(unbalanced_json, [[0, 0, 0, 0, 0, 0, 0]], false)
      ensure
        GC.stress = false
      end
    end.to raise_error(described_class::ParseError)
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "json_scanner"
|
4
|
+
|
5
|
+
RSpec.configure do |rspec|
  # Persist example status so that --only-failures and --next-failure work
  rspec.example_status_persistence_file_path = ".rspec_status"

  # Keep RSpec from exposing its methods globally on `Module` and `main`
  rspec.disable_monkey_patching!

  # Only the `expect` syntax is enabled
  rspec.expect_with(:rspec) { |expectations| expectations.syntax = :expect }
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: json_scanner
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- uvlad7
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-12-15 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: This gem uses yajl lib to scan a json string and allows you to parse
|
14
|
+
pieces of it
|
15
|
+
email:
|
16
|
+
- uvlad7@gmail.com
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/json_scanner/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- README.md
|
23
|
+
- ext/json_scanner/extconf.rb
|
24
|
+
- ext/json_scanner/json_scanner.c
|
25
|
+
- ext/json_scanner/json_scanner.h
|
26
|
+
- lib/json_scanner.rb
|
27
|
+
- lib/json_scanner/version.rb
|
28
|
+
- sig/json_scanner.rbs
|
29
|
+
- spec/json_scanner_spec.rb
|
30
|
+
- spec/spec_helper.rb
|
31
|
+
homepage: https://github.com/uvlad7/json_scanner
|
32
|
+
licenses:
|
33
|
+
- MIT
|
34
|
+
metadata:
|
35
|
+
homepage_uri: https://github.com/uvlad7/json_scanner
|
36
|
+
source_code_uri: https://github.com/uvlad7/json_scanner
|
37
|
+
changelog_uri: https://github.com/uvlad7/json_scanner/CHANGELOG.md
|
38
|
+
rubygems_mfa_required: 'true'
|
39
|
+
post_install_message:
|
40
|
+
rdoc_options: []
|
41
|
+
require_paths:
|
42
|
+
- lib
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.3.8
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
requirements:
|
54
|
+
- libyajl2, v2.1
|
55
|
+
- libyajl-dev, v2.1
|
56
|
+
rubygems_version: 3.5.7
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Extract values from JSON without full parsing
|
60
|
+
test_files: []
|