json_scanner 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +113 -3
- data/ext/json_scanner/json_scanner.c +373 -76
- data/lib/json_scanner/version.rb +1 -1
- data/spec/extensiontesttask.rb +128 -0
- data/spec/json_scanner_spec.c +0 -0
- data/spec/json_scanner_spec.rb +169 -2
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ca9d160b389a5b605a37baeebd8e9d704a9b42712b9c8de9d0667ffa7e6b3d35
|
4
|
+
data.tar.gz: f2c0362e6bb4484e47fd5dbdda4afd7332ce42d266b4d0ebe681f24e18c31b0a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 91d190291d0e2c16db38c523a58fbbff23cde136481b2c4a19446d25169a77390eee4792979cd3a22724bdf2c89132b5dc3ccc78c1553c17fe21227a70e1bf3c
|
7
|
+
data.tar.gz: dc704ac0ba609f209883d5af50f3f9cabcde04a0eb2c3f5ac04f964010bf866fd727c5c175dd40917526b693f13bc4b532f7a4fabce9859c0b44238cf3fa421b
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# JsonScanner
|
4
4
|
|
5
|
-
Extract values from JSON without full parsing. This gem uses yajl
|
5
|
+
Extract values from JSON without full parsing. This gem uses the `yajl` library to scan a JSON string and allows you to parse pieces of it.
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
@@ -16,6 +16,8 @@ If bundler is not being used to manage dependencies, install the gem by executin
|
|
16
16
|
|
17
17
|
## Usage
|
18
18
|
|
19
|
+
Basic usage
|
20
|
+
|
19
21
|
```ruby
|
20
22
|
require "json"
|
21
23
|
require "json_scanner"
|
@@ -32,15 +34,123 @@ emoji_json = '{"grin": "😁", "heart": "😍", "rofl": "🤣"}'
|
|
32
34
|
begin_pos, end_pos, = JsonScanner.scan(emoji_json, [["heart"]], false).first.first
|
33
35
|
emoji_json.byteslice(begin_pos...end_pos)
|
34
36
|
# => "\"😍\""
|
35
|
-
# Note: most likely don't need `quirks_mode` option
|
36
|
-
#
|
37
|
+
# Note: You most likely don't need the `quirks_mode` option unless you are using an older version
|
38
|
+
# of Ruby with its bundled (or a similarly old) version of the json gem. In newer versions, `quirks_mode` is enabled by default.
|
37
39
|
JSON.parse(emoji_json.byteslice(begin_pos...end_pos), quirks_mode: true)
|
38
40
|
# => "😍"
|
39
41
|
# You can also do this
|
40
42
|
# emoji_json.force_encoding(Encoding::BINARY)[begin_pos...end_pos].force_encoding(Encoding::UTF_8)
|
41
43
|
# => "\"😍\""
|
44
|
+
|
45
|
+
# Ranges are supported as matchers for indexes with the following restrictions:
|
46
|
+
# - the start of a range must be non-negative
|
47
|
+
# - the end of a range must be non-negative or -1
|
48
|
+
# - a range with -1 end must be closed, e.g. (0..-1) works, but (0...-1) is forbidden
|
49
|
+
JsonScanner.scan('[0, 42, 0]', [[(1..-1)]])
|
50
|
+
# => [[[4, 6, :number], [8, 9, :number]]]
|
51
|
+
JsonScanner.scan('[0, 42, 0]', [[JsonScanner::ANY_INDEX]])
|
52
|
+
# => [[[1, 2, :number], [4, 6, :number], [8, 9, :number]]]
|
53
|
+
|
54
|
+
# Special matcher JsonScanner::ANY_KEY is supported for object keys
|
55
|
+
JsonScanner.scan('{"a": 1, "b": 2}', [[JsonScanner::ANY_KEY]], with_path: true)
|
56
|
+
# => [[[["a"], [6, 7, :number]], [["b"], [14, 15, :number]]]]
|
57
|
+
# Regex matchers aren't supported yet, but you can simulate them using the `with_path` option
|
58
|
+
JsonScanner.scan(
|
59
|
+
'{"question1": 1, "answer": 42, "question2": 2}',
|
60
|
+
[[JsonScanner::ANY_KEY]], with_path: true,
|
61
|
+
).map do |res|
|
62
|
+
res.map do |path, (begin_pos, end_pos, type)|
|
63
|
+
[begin_pos, end_pos, type] if path[0] =~ /\Aquestion/
|
64
|
+
end.compact
|
65
|
+
end
|
66
|
+
# => [[[14, 15, :number], [44, 45, :number]]]
|
67
|
+
```
|
68
|
+
|
69
|
+
## Options
|
70
|
+
|
71
|
+
`JsonScanner` supports multiple options
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
JsonScanner.scan('[0, 42, 0]', [[(1..-1)]], with_path: true)
|
75
|
+
# => [[[[1], [4, 6, :number]], [[2], [8, 9, :number]]]]
|
76
|
+
JsonScanner.scan('[0, 42],', [[(1..-1)]], verbose_error: true)
|
77
|
+
# JsonScanner::ParseError (parse error: trailing garbage)
|
78
|
+
# [0, 42],
|
79
|
+
# (right here) ------^
|
80
|
+
# Note: the 'right here' pointer is wrong in case of a premature EOF error; this is a bug in libyajl
|
81
|
+
JsonScanner.scan('[0, 42,', [[(1..-1)]], verbose_error: true)
|
82
|
+
# JsonScanner::ParseError (parse error: premature EOF)
|
83
|
+
# [0, 42,
|
84
|
+
# (right here) ------^
|
85
|
+
JsonScanner.scan('[0, /* answer */ 42, 0]', [[(1..-1)]], allow_comments: true)
|
86
|
+
# => [[[17, 19, :number], [21, 22, :number]]]
|
87
|
+
JsonScanner.scan("\"\x81\x83\"", [[]], dont_validate_strings: true)
|
88
|
+
# => [[[0, 4, :string]]]
|
89
|
+
JsonScanner.scan("{\"\x81\x83\": 42}", [[JsonScanner::ANY_KEY]], dont_validate_strings: true, with_path: true)
|
90
|
+
# => [[[["\x81\x83"], [7, 9, :number]]]]
|
91
|
+
JsonScanner.scan('[0, 42, 0]garbage', [[(1..-1)]], allow_trailing_garbage: true)
|
92
|
+
# => [[[4, 6, :number], [8, 9, :number]]]
|
93
|
+
JsonScanner.scan('[0, 42, 0] [0, 34]', [[(1..-1)]], allow_multiple_values: true)
|
94
|
+
# => [[[4, 6, :number], [8, 9, :number], [16, 18, :number]]]
|
95
|
+
JsonScanner.scan('[0, 42, 0', [[(1..-1)]], allow_partial_values: true)
|
96
|
+
# => [[[4, 6, :number], [8, 9, :number]]]
|
97
|
+
JsonScanner.scan('{"a": 1}', [[JsonScanner::ANY_KEY]], with_path: true, symbolize_path_keys: true)
|
98
|
+
# => [[[[:a], [6, 7, :number]]]]
|
42
99
|
```
|
43
100
|
|
101
|
+
### Comments in the JSON
|
102
|
+
|
103
|
+
Note that the standard `JSON` library supports comments, so you may want to enable them in `JsonScanner` as well
|
104
|
+
```ruby
|
105
|
+
json_str = '{"answer": {"value": 42 /* the Ultimate Question of Life, the Universe, and Everything */ }}'
|
106
|
+
JsonScanner.scan(json_str, [["answer"]], allow_comments: true).first.map do |begin_pos, end_pos, _type|
|
107
|
+
JSON.parse(json_str.byteslice(begin_pos...end_pos), quirks_mode: true)
|
108
|
+
end
|
109
|
+
# => [{"value"=>42}]
|
110
|
+
```
|
111
|
+
|
112
|
+
### Find the end of a JSON string
|
113
|
+
|
114
|
+
The `allow_trailing_garbage` option may come in handy if you want to extract a JSON string from JavaScript text
|
115
|
+
```ruby
|
116
|
+
script_text = <<~'JS'
|
117
|
+
<script>window.__APOLLO_STATE__={"ContentItem:0":{"__typename":"ContentItem","id":0, "configurationType":"NO_CONFIGURATION","replacementPartsUrl":null,"relatedCategories":[{"__ref":"Category:109450"},{"__ref":"Category:82044355"},{"__ref":"Category:109441"},{"__ref":"Category:109442"},{"__ref":"Category:109449"},{"__ref":"Category:109444"},{"__ref":"Category:82043730"}],"recommendedOptions":[]}};window.__APPVERSION__=7018;window.__CONFIG_ENV__={value: 'PRODUCTION'};</script>
|
118
|
+
JS
|
119
|
+
json_with_trailing_garbage = script_text[/__APOLLO_STATE__\s*=\s*({.+)/, 1]
|
120
|
+
json_end_pos = JsonScanner.scan(json_with_trailing_garbage, [[]], allow_trailing_garbage: true).first.first[1]
|
121
|
+
apollo_state = JSON.parse(json_with_trailing_garbage[0...json_end_pos])
|
122
|
+
```
|
123
|
+
|
124
|
+
## Reuse configuration
|
125
|
+
|
126
|
+
You can create a `JsonScanner::Config` instance and reuse it between `JsonScanner.scan` calls
|
127
|
+
|
128
|
+
```ruby
|
129
|
+
require "json_scanner"
|
130
|
+
|
131
|
+
config = JsonScanner::Config.new([[], ["key"], [(0..-1)]])
|
132
|
+
# => #<JsonScanner::Config [[], ['key'], [(0..9223372036854775807)]]>
|
133
|
+
JsonScanner.scan('{"key": "42"}', config)
|
134
|
+
# => [[[0, 13, :object]], [[8, 12, :string]], []]
|
135
|
+
JsonScanner.scan('{"key": "42"}', config, with_path: true)
|
136
|
+
# => [[[[], [0, 13, :object]]], [[["key"], [8, 12, :string]]], []]
|
137
|
+
JsonScanner.scan('[0, 42]', config)
|
138
|
+
# => [[[0, 7, :array]], [], [[1, 2, :number], [4, 6, :number]]]
|
139
|
+
JsonScanner.scan('[0, 42]', config, with_path: true)
|
140
|
+
# => [[[[], [0, 7, :array]]], [], [[[0], [1, 2, :number]], [[1], [4, 6, :number]]]]
|
141
|
+
```
|
142
|
+
|
143
|
+
Options can be passed as a hash, even on Ruby 3
|
144
|
+
```ruby
|
145
|
+
options = { allow_trailing_garbage: true, allow_partial_values: true }
|
146
|
+
JsonScanner.scan('[0, 42', [[1]], options) == JsonScanner.scan('[0, 42]_', [[1]], options)
|
147
|
+
# => true
|
148
|
+
```
|
149
|
+
|
150
|
+
## Streaming mode
|
151
|
+
|
152
|
+
Streaming mode isn't supported yet, as it's harder to implement and to use. I plan to add it in the future; its API is open to discussion. If you have suggestions, use cases, or preferences for how it should behave, I’d love to hear from you!
|
153
|
+
|
44
154
|
## Development
|
45
155
|
|
46
156
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
@@ -1,8 +1,12 @@
|
|
1
1
|
#include "json_scanner.h"
|
2
2
|
|
3
3
|
VALUE rb_mJsonScanner;
|
4
|
+
VALUE rb_cJsonScannerConfig;
|
4
5
|
VALUE rb_eJsonScannerParseError;
|
5
|
-
|
6
|
+
#define BYTES_CONSUMED "bytes_consumed"
|
7
|
+
ID rb_iv_bytes_consumed;
|
8
|
+
#define SCAN_KWARGS_SIZE 8
|
9
|
+
ID scan_kwargs_table[SCAN_KWARGS_SIZE];
|
6
10
|
|
7
11
|
VALUE null_sym;
|
8
12
|
VALUE boolean_sym;
|
@@ -11,11 +15,13 @@ VALUE string_sym;
|
|
11
15
|
VALUE object_sym;
|
12
16
|
VALUE array_sym;
|
13
17
|
|
18
|
+
VALUE any_key_sym;
|
19
|
+
|
14
20
|
enum matcher_type
|
15
21
|
{
|
16
22
|
MATCHER_KEY,
|
17
23
|
MATCHER_INDEX,
|
18
|
-
|
24
|
+
MATCHER_ANY_KEY,
|
19
25
|
MATCHER_INDEX_RANGE,
|
20
26
|
// MATCHER_KEYS_LIST,
|
21
27
|
// MATCHER_KEY_REGEX,
|
@@ -70,30 +76,115 @@ typedef struct
|
|
70
76
|
typedef struct
|
71
77
|
{
|
72
78
|
int with_path;
|
73
|
-
|
79
|
+
int symbolize_path_keys;
|
74
80
|
int paths_len;
|
75
|
-
|
81
|
+
paths_t *paths;
|
76
82
|
int current_path_len;
|
77
83
|
int max_path_len;
|
84
|
+
path_elem_t *current_path;
|
78
85
|
// Easier to use a Ruby array for result than convert later
|
86
|
+
// must be supplied by the caller and RB_GC_GUARD-ed if it isn't on the stack
|
79
87
|
VALUE points_list;
|
80
88
|
// by depth
|
81
89
|
size_t *starts;
|
82
90
|
// VALUE rb_err;
|
83
91
|
yajl_handle handle;
|
92
|
+
size_t yajl_bytes_consumed;
|
84
93
|
} scan_ctx;
|
85
94
|
|
95
|
+
inline size_t scan_ctx_get_bytes_consumed(scan_ctx *ctx)
|
96
|
+
{
|
97
|
+
return ctx->yajl_bytes_consumed + yajl_get_bytes_consumed(ctx->handle);
|
98
|
+
}
|
99
|
+
|
100
|
+
inline void scan_ctx_save_bytes_consumed(scan_ctx *ctx)
|
101
|
+
{
|
102
|
+
ctx->yajl_bytes_consumed += yajl_get_bytes_consumed(ctx->handle);
|
103
|
+
}
|
104
|
+
|
105
|
+
void scan_ctx_debug(scan_ctx *ctx)
|
106
|
+
{
|
107
|
+
// actually might have been cleared by GC already, be careful, debug only when in valid state
|
108
|
+
VALUE points_list_inspect = ctx->points_list == Qundef ? rb_str_new_cstr("undef") : rb_sprintf("%" PRIsVALUE, rb_inspect(ctx->points_list));
|
109
|
+
fprintf(stderr, "\nscan_ctx {\n");
|
110
|
+
fprintf(stderr, " with_path: %s,\n", ctx->with_path ? "true" : "false");
|
111
|
+
fprintf(stderr, " symbolize_path_keys: %s,\n", ctx->symbolize_path_keys ? "true" : "false");
|
112
|
+
fprintf(stderr, " paths_len: %d,\n", ctx->paths_len);
|
113
|
+
|
114
|
+
fprintf(stderr, " paths: [\n");
|
115
|
+
for (int i = 0; ctx->paths && i < ctx->paths_len; i++)
|
116
|
+
{
|
117
|
+
fprintf(stderr, " [");
|
118
|
+
for (int j = 0; j < ctx->paths[i].len; j++)
|
119
|
+
{
|
120
|
+
switch (ctx->paths[i].elems[j].type)
|
121
|
+
{
|
122
|
+
case MATCHER_KEY:
|
123
|
+
fprintf(stderr, "'%.*s'", (int)ctx->paths[i].elems[j].value.key.len, ctx->paths[i].elems[j].value.key.val);
|
124
|
+
break;
|
125
|
+
case MATCHER_INDEX:
|
126
|
+
fprintf(stderr, "%ld", ctx->paths[i].elems[j].value.index);
|
127
|
+
break;
|
128
|
+
case MATCHER_INDEX_RANGE:
|
129
|
+
fprintf(stderr, "(%ld..%ld)", ctx->paths[i].elems[j].value.range.start, ctx->paths[i].elems[j].value.range.end);
|
130
|
+
break;
|
131
|
+
case MATCHER_ANY_KEY:
|
132
|
+
fprintf(stderr, "('*'..'*')");
|
133
|
+
break;
|
134
|
+
}
|
135
|
+
if (j < ctx->paths[i].len - 1)
|
136
|
+
fprintf(stderr, ", ");
|
137
|
+
}
|
138
|
+
fprintf(stderr, "],\n");
|
139
|
+
}
|
140
|
+
fprintf(stderr, " ],\n");
|
141
|
+
|
142
|
+
fprintf(stderr, " current_path_len: %d,\n", ctx->current_path_len);
|
143
|
+
fprintf(stderr, " max_path_len: %d,\n", ctx->max_path_len);
|
144
|
+
fprintf(stderr, " current_path: [");
|
145
|
+
for (int i = 0; i < ctx->current_path_len; i++)
|
146
|
+
{
|
147
|
+
switch (ctx->current_path[i].type)
|
148
|
+
{
|
149
|
+
case PATH_KEY:
|
150
|
+
fprintf(stderr, "'%.*s'", (int)ctx->current_path[i].value.key.len, ctx->current_path[i].value.key.val);
|
151
|
+
break;
|
152
|
+
case PATH_INDEX:
|
153
|
+
fprintf(stderr, "%ld", ctx->current_path[i].value.index);
|
154
|
+
break;
|
155
|
+
}
|
156
|
+
if (i < ctx->current_path_len - 1)
|
157
|
+
fprintf(stderr, ", ");
|
158
|
+
}
|
159
|
+
fprintf(stderr, "],\n");
|
160
|
+
|
161
|
+
fprintf(stderr, " points_list: %.*s,\n", RSTRING_LENINT(points_list_inspect), RSTRING_PTR(points_list_inspect));
|
162
|
+
fprintf(stderr, " starts: [");
|
163
|
+
for (int i = 0; i <= ctx->max_path_len; i++)
|
164
|
+
{
|
165
|
+
fprintf(stderr, "%ld", ctx->starts[i]);
|
166
|
+
if (i < ctx->max_path_len)
|
167
|
+
fprintf(stderr, ", ");
|
168
|
+
}
|
169
|
+
fprintf(stderr, "],\n");
|
170
|
+
|
171
|
+
fprintf(stderr, " handle: %p,\n", ctx->handle);
|
172
|
+
fprintf(stderr, " yajl_bytes_consumed: %ld,\n", ctx->yajl_bytes_consumed);
|
173
|
+
fprintf(stderr, "}\n\n\n");
|
174
|
+
}
|
175
|
+
|
86
176
|
// FIXME: This will cause memory leak if ruby_xmalloc raises
|
87
|
-
|
177
|
+
// path_ary must be RB_GC_GUARD-ed by the caller
|
178
|
+
VALUE scan_ctx_init(scan_ctx *ctx, VALUE path_ary, VALUE string_keys)
|
88
179
|
{
|
89
180
|
int path_ary_len;
|
90
|
-
scan_ctx *ctx;
|
91
181
|
paths_t *paths;
|
92
182
|
// TODO: Allow to_ary and sized enumerables
|
93
183
|
rb_check_type(path_ary, T_ARRAY);
|
94
184
|
path_ary_len = rb_long2int(rb_array_len(path_ary));
|
95
185
|
// Check types early before any allocations, so exception is ok
|
96
186
|
// TODO: Fix this, just handle errors
|
187
|
+
// It's not possible that another Ruby thread changes path_ary items between these two loops, because C call holds GVL
|
97
188
|
for (int i = 0; i < path_ary_len; i++)
|
98
189
|
{
|
99
190
|
int path_len;
|
@@ -103,38 +194,42 @@ scan_ctx *scan_ctx_init(VALUE path_ary, int with_path)
|
|
103
194
|
for (int j = 0; j < path_len; j++)
|
104
195
|
{
|
105
196
|
VALUE entry = rb_ary_entry(path, j);
|
106
|
-
|
107
|
-
if (type == T_STRING)
|
197
|
+
switch (TYPE(entry))
|
108
198
|
{
|
199
|
+
case T_SYMBOL:
|
200
|
+
entry = rb_sym2str(entry);
|
201
|
+
/* fall through */
|
202
|
+
case T_STRING:
|
109
203
|
#if LONG_MAX > SIZE_MAX
|
110
204
|
RSTRING_LENINT(entry);
|
111
205
|
#endif
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
206
|
+
break;
|
207
|
+
case T_FIXNUM:
|
208
|
+
case T_BIGNUM:
|
209
|
+
NUM2LONG(entry);
|
210
|
+
break;
|
211
|
+
default:
|
118
212
|
{
|
119
213
|
VALUE range_beg, range_end;
|
120
214
|
long end_val;
|
121
215
|
int open_ended;
|
122
216
|
if (rb_range_values(entry, &range_beg, &range_end, &open_ended) != Qtrue)
|
123
|
-
|
124
|
-
if (
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
217
|
+
return rb_exc_new_cstr(rb_eArgError, "path elements must be strings, integers, or ranges");
|
218
|
+
if (range_beg != any_key_sym || range_end != any_key_sym)
|
219
|
+
{
|
220
|
+
if (NUM2LONG(range_beg) < 0L)
|
221
|
+
return rb_exc_new_cstr(rb_eArgError, "range start must be positive");
|
222
|
+
end_val = NUM2LONG(range_end);
|
223
|
+
if (end_val < -1L)
|
224
|
+
return rb_exc_new_cstr(rb_eArgError, "range end must be positive or -1");
|
225
|
+
if (end_val == -1L && open_ended)
|
226
|
+
return rb_exc_new_cstr(rb_eArgError, "range with -1 end must be closed");
|
227
|
+
}
|
228
|
+
}
|
131
229
|
}
|
132
230
|
}
|
133
231
|
}
|
134
232
|
|
135
|
-
ctx = ruby_xmalloc(sizeof(scan_ctx));
|
136
|
-
|
137
|
-
ctx->with_path = with_path;
|
138
233
|
ctx->max_path_len = 0;
|
139
234
|
|
140
235
|
paths = ruby_xmalloc(sizeof(paths_t) * path_ary_len);
|
@@ -149,9 +244,21 @@ scan_ctx *scan_ctx_init(VALUE path_ary, int with_path)
|
|
149
244
|
for (int j = 0; j < path_len; j++)
|
150
245
|
{
|
151
246
|
VALUE entry = rb_ary_entry(path, j);
|
152
|
-
|
153
|
-
if (type == T_STRING)
|
247
|
+
switch (TYPE(entry))
|
154
248
|
{
|
249
|
+
case T_SYMBOL:
|
250
|
+
entry = rb_sym2str(entry);
|
251
|
+
/* fall through */
|
252
|
+
case T_STRING:
|
253
|
+
{
|
254
|
+
if (string_keys != Qundef)
|
255
|
+
{
|
256
|
+
// If string_keys is provided, we need to duplicate the string
|
257
|
+
// to avoid use-after-free issues and to add the newly created string to the string_keys array.
|
258
|
+
// In Ruby 2.2 and newer symbols can be GC-ed, so we need to duplicate them as well.
|
259
|
+
entry = rb_str_dup(entry);
|
260
|
+
rb_ary_push(string_keys, entry);
|
261
|
+
}
|
155
262
|
paths[i].elems[j].type = MATCHER_KEY;
|
156
263
|
paths[i].elems[j].value.key.val = RSTRING_PTR(entry);
|
157
264
|
#if LONG_MAX > SIZE_MAX
|
@@ -160,25 +267,36 @@ scan_ctx *scan_ctx_init(VALUE path_ary, int with_path)
|
|
160
267
|
paths[i].elems[j].value.key.len = RSTRING_LEN(entry);
|
161
268
|
#endif
|
162
269
|
}
|
163
|
-
|
270
|
+
break;
|
271
|
+
case T_FIXNUM:
|
272
|
+
case T_BIGNUM:
|
164
273
|
{
|
165
274
|
paths[i].elems[j].type = MATCHER_INDEX;
|
166
275
|
paths[i].elems[j].value.index = FIX2LONG(entry);
|
167
276
|
}
|
168
|
-
|
277
|
+
break;
|
278
|
+
default:
|
169
279
|
{
|
170
280
|
VALUE range_beg, range_end;
|
171
281
|
int open_ended;
|
172
|
-
paths[i].elems[j].type = MATCHER_INDEX_RANGE;
|
173
282
|
rb_range_values(entry, &range_beg, &range_end, &open_ended);
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
paths[i].elems[j].value.range.
|
283
|
+
if (range_beg == any_key_sym && range_end == any_key_sym)
|
284
|
+
{
|
285
|
+
paths[i].elems[j].type = MATCHER_ANY_KEY;
|
286
|
+
}
|
287
|
+
else
|
288
|
+
{
|
289
|
+
paths[i].elems[j].type = MATCHER_INDEX_RANGE;
|
290
|
+
paths[i].elems[j].value.range.start = NUM2LONG(range_beg);
|
291
|
+
paths[i].elems[j].value.range.end = NUM2LONG(range_end);
|
292
|
+
// (value..-1) works as expected, (value...-1) is forbidden above
|
293
|
+
if (paths[i].elems[j].value.range.end == -1L)
|
294
|
+
paths[i].elems[j].value.range.end = LONG_MAX;
|
295
|
+
// -1 here is fine, so, (0...0) works just as expected - doesn't match anything
|
296
|
+
if (open_ended)
|
297
|
+
paths[i].elems[j].value.range.end--;
|
298
|
+
}
|
299
|
+
}
|
182
300
|
}
|
183
301
|
}
|
184
302
|
paths[i].len = path_len;
|
@@ -189,32 +307,37 @@ scan_ctx *scan_ctx_init(VALUE path_ary, int with_path)
|
|
189
307
|
ctx->paths_len = path_ary_len;
|
190
308
|
ctx->current_path = ruby_xmalloc2(sizeof(path_elem_t), ctx->max_path_len);
|
191
309
|
|
192
|
-
ctx->current_path_len = 0;
|
193
|
-
ctx->points_list = rb_ary_new_capa(path_ary_len);
|
194
|
-
for (int i = 0; i < path_ary_len; i++)
|
195
|
-
{
|
196
|
-
rb_ary_push(ctx->points_list, rb_ary_new());
|
197
|
-
}
|
198
|
-
|
199
310
|
ctx->starts = ruby_xmalloc2(sizeof(size_t), ctx->max_path_len + 1);
|
311
|
+
return Qundef; // no error
|
312
|
+
}
|
313
|
+
|
314
|
+
// resets temporary values in the config
|
315
|
+
void scan_ctx_reset(scan_ctx *ctx, VALUE points_list, int with_path, int symbolize_path_keys)
|
316
|
+
{
|
317
|
+
// TODO: reset matched_depth if implemented
|
318
|
+
ctx->current_path_len = 0;
|
200
319
|
// ctx->rb_err = Qnil;
|
201
320
|
ctx->handle = NULL;
|
202
|
-
|
203
|
-
|
321
|
+
ctx->yajl_bytes_consumed = 0;
|
322
|
+
ctx->points_list = points_list;
|
323
|
+
ctx->with_path = with_path;
|
324
|
+
ctx->symbolize_path_keys = symbolize_path_keys;
|
204
325
|
}
|
205
326
|
|
206
327
|
void scan_ctx_free(scan_ctx *ctx)
|
207
328
|
{
|
329
|
+
// fprintf(stderr, "scan_ctx_free\n");
|
208
330
|
if (!ctx)
|
209
331
|
return;
|
210
332
|
ruby_xfree(ctx->starts);
|
211
333
|
ruby_xfree(ctx->current_path);
|
334
|
+
if (!ctx->paths)
|
335
|
+
return;
|
212
336
|
for (int i = 0; i < ctx->paths_len; i++)
|
213
337
|
{
|
214
338
|
ruby_xfree(ctx->paths[i].elems);
|
215
339
|
}
|
216
340
|
ruby_xfree(ctx->paths);
|
217
|
-
ruby_xfree(ctx);
|
218
341
|
}
|
219
342
|
|
220
343
|
// noexcept
|
@@ -239,37 +362,38 @@ typedef enum
|
|
239
362
|
} value_type;
|
240
363
|
|
241
364
|
// noexcept
|
242
|
-
VALUE create_point(scan_ctx *sctx, value_type type, size_t length
|
365
|
+
VALUE create_point(scan_ctx *sctx, value_type type, size_t length)
|
243
366
|
{
|
244
|
-
VALUE values[3];
|
245
|
-
|
367
|
+
VALUE values[3], point;
|
368
|
+
size_t curr_pos = scan_ctx_get_bytes_consumed(sctx);
|
369
|
+
point = rb_ary_new_capa(3);
|
246
370
|
// noexcept
|
247
|
-
values[1] =
|
371
|
+
values[1] = ULL2NUM(curr_pos);
|
248
372
|
switch (type)
|
249
373
|
{
|
250
374
|
// FIXME: size_t can be longer than ulong
|
251
375
|
case null_value:
|
252
|
-
values[0] =
|
376
|
+
values[0] = ULL2NUM(curr_pos - length);
|
253
377
|
values[2] = null_sym;
|
254
378
|
break;
|
255
379
|
case boolean_value:
|
256
|
-
values[0] =
|
380
|
+
values[0] = ULL2NUM(curr_pos - length);
|
257
381
|
values[2] = boolean_sym;
|
258
382
|
break;
|
259
383
|
case number_value:
|
260
|
-
values[0] =
|
384
|
+
values[0] = ULL2NUM(curr_pos - length);
|
261
385
|
values[2] = number_sym;
|
262
386
|
break;
|
263
387
|
case string_value:
|
264
|
-
values[0] =
|
388
|
+
values[0] = ULL2NUM(curr_pos - length);
|
265
389
|
values[2] = string_sym;
|
266
390
|
break;
|
267
391
|
case object_value:
|
268
|
-
values[0] =
|
392
|
+
values[0] = ULL2NUM(sctx->starts[sctx->current_path_len]);
|
269
393
|
values[2] = object_sym;
|
270
394
|
break;
|
271
395
|
case array_value:
|
272
|
-
values[0] =
|
396
|
+
values[0] = ULL2NUM(sctx->starts[sctx->current_path_len]);
|
273
397
|
values[2] = array_sym;
|
274
398
|
break;
|
275
399
|
}
|
@@ -288,10 +412,13 @@ VALUE create_path(scan_ctx *sctx)
|
|
288
412
|
switch (sctx->current_path[i].type)
|
289
413
|
{
|
290
414
|
case PATH_KEY:
|
291
|
-
|
415
|
+
if (sctx->symbolize_path_keys)
|
416
|
+
entry = rb_id2sym(rb_intern2(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len));
|
417
|
+
else
|
418
|
+
entry = rb_str_new(sctx->current_path[i].value.key.val, sctx->current_path[i].value.key.len);
|
292
419
|
break;
|
293
420
|
case PATH_INDEX:
|
294
|
-
entry =
|
421
|
+
entry = LONG2NUM(sctx->current_path[i].value.index);
|
295
422
|
break;
|
296
423
|
default:
|
297
424
|
entry = Qnil;
|
@@ -307,7 +434,7 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
307
434
|
// TODO: Abort parsing if all paths are matched and no more mathces are possible: only trivial key/index matchers at the current level
|
308
435
|
// TODO: Don't re-compare already matched prefixes; hard to invalidate, though
|
309
436
|
// TODO: Might fail in case of no memory
|
310
|
-
VALUE point = Qundef;
|
437
|
+
VALUE point = Qundef, path;
|
311
438
|
int match;
|
312
439
|
for (int i = 0; i < sctx->paths_len; i++)
|
313
440
|
{
|
@@ -319,6 +446,10 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
319
446
|
{
|
320
447
|
switch (sctx->paths[i].elems[j].type)
|
321
448
|
{
|
449
|
+
case MATCHER_ANY_KEY:
|
450
|
+
if (sctx->current_path[j].type != PATH_KEY)
|
451
|
+
match = false;
|
452
|
+
break;
|
322
453
|
case MATCHER_KEY:
|
323
454
|
if (sctx->current_path[j].type != PATH_KEY ||
|
324
455
|
sctx->current_path[j].value.key.len != sctx->paths[i].elems[j].value.key.len ||
|
@@ -344,10 +475,11 @@ void save_point(scan_ctx *sctx, value_type type, size_t length)
|
|
344
475
|
{
|
345
476
|
if (point == Qundef)
|
346
477
|
{
|
347
|
-
point = create_point(sctx, type, length
|
478
|
+
point = create_point(sctx, type, length);
|
348
479
|
if (sctx->with_path)
|
349
480
|
{
|
350
|
-
|
481
|
+
path = create_path(sctx);
|
482
|
+
point = rb_ary_new_from_args(2, path, point);
|
351
483
|
}
|
352
484
|
}
|
353
485
|
// rb_ary_push raises only in case of a frozen array, which is not the case
|
@@ -411,7 +543,7 @@ int scan_on_start_object(void *ctx)
|
|
411
543
|
return true;
|
412
544
|
}
|
413
545
|
increment_arr_index(sctx);
|
414
|
-
sctx->starts[sctx->current_path_len] =
|
546
|
+
sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
|
415
547
|
if (sctx->current_path_len < sctx->max_path_len)
|
416
548
|
sctx->current_path[sctx->current_path_len].type = PATH_KEY;
|
417
549
|
sctx->current_path_len++;
|
@@ -451,7 +583,7 @@ int scan_on_start_array(void *ctx)
|
|
451
583
|
return true;
|
452
584
|
}
|
453
585
|
increment_arr_index(sctx);
|
454
|
-
sctx->starts[sctx->current_path_len] =
|
586
|
+
sctx->starts[sctx->current_path_len] = scan_ctx_get_bytes_consumed(sctx) - 1;
|
455
587
|
if (sctx->current_path_len < sctx->max_path_len)
|
456
588
|
{
|
457
589
|
sctx->current_path[sctx->current_path_len].type = PATH_INDEX;
|
@@ -471,6 +603,107 @@ int scan_on_end_array(void *ctx)
|
|
471
603
|
return true;
|
472
604
|
}
|
473
605
|
|
606
|
+
void config_free(void *data)
|
607
|
+
{
|
608
|
+
scan_ctx_free((scan_ctx *)data);
|
609
|
+
ruby_xfree(data);
|
610
|
+
}
|
611
|
+
|
612
|
+
size_t config_size(const void *data)
|
613
|
+
{
|
614
|
+
// see ObjectSpace.memsize_of
|
615
|
+
scan_ctx *ctx = (scan_ctx *)data;
|
616
|
+
size_t res = sizeof(scan_ctx);
|
617
|
+
// current_path
|
618
|
+
if (ctx->current_path != NULL)
|
619
|
+
res += ctx->max_path_len * sizeof(path_elem_t);
|
620
|
+
// starts
|
621
|
+
if (ctx->starts != NULL)
|
622
|
+
res += ctx->max_path_len * sizeof(size_t);
|
623
|
+
if (ctx->paths != NULL)
|
624
|
+
{
|
625
|
+
res += ctx->paths_len * sizeof(paths_t);
|
626
|
+
for (int i = 0; i < ctx->paths_len; i++)
|
627
|
+
{
|
628
|
+
res += ctx->paths[i].len * sizeof(path_matcher_elem_t);
|
629
|
+
}
|
630
|
+
}
|
631
|
+
return res;
|
632
|
+
}
|
633
|
+
|
634
|
+
static const rb_data_type_t config_type = {
|
635
|
+
.wrap_struct_name = "json_scanner_config",
|
636
|
+
.function = {
|
637
|
+
.dfree = config_free,
|
638
|
+
.dsize = config_size,
|
639
|
+
},
|
640
|
+
.flags = RUBY_TYPED_FREE_IMMEDIATELY,
|
641
|
+
};
|
642
|
+
|
643
|
+
VALUE config_alloc(VALUE self)
|
644
|
+
{
|
645
|
+
scan_ctx *ctx = ruby_xmalloc(sizeof(scan_ctx));
|
646
|
+
ctx->paths = NULL;
|
647
|
+
ctx->paths_len = 0;
|
648
|
+
ctx->current_path = NULL;
|
649
|
+
ctx->max_path_len = 0;
|
650
|
+
ctx->starts = NULL;
|
651
|
+
scan_ctx_reset(ctx, Qundef, false, false);
|
652
|
+
return TypedData_Wrap_Struct(self, &config_type, ctx);
|
653
|
+
}
|
654
|
+
|
655
|
+
VALUE config_m_initialize(VALUE self, VALUE path_ary)
|
656
|
+
{
|
657
|
+
scan_ctx *ctx;
|
658
|
+
VALUE scan_ctx_init_err, string_keys;
|
659
|
+
TypedData_Get_Struct(self, scan_ctx, &config_type, ctx);
|
660
|
+
string_keys = rb_ary_new();
|
661
|
+
scan_ctx_init_err = scan_ctx_init(ctx, path_ary, string_keys);
|
662
|
+
if (scan_ctx_init_err != Qundef)
|
663
|
+
{
|
664
|
+
rb_exc_raise(scan_ctx_init_err);
|
665
|
+
}
|
666
|
+
rb_iv_set(self, "string_keys", string_keys);
|
667
|
+
return self;
|
668
|
+
}
|
669
|
+
|
670
|
+
VALUE config_m_inspect(VALUE self)
|
671
|
+
{
|
672
|
+
scan_ctx *ctx;
|
673
|
+
VALUE res;
|
674
|
+
TypedData_Get_Struct(self, scan_ctx, &config_type, ctx);
|
675
|
+
res = rb_sprintf("#<%" PRIsVALUE " [", rb_class_name(CLASS_OF(self)));
|
676
|
+
for (int i = 0; ctx->paths && i < ctx->paths_len; i++)
|
677
|
+
{
|
678
|
+
rb_str_cat_cstr(res, "[");
|
679
|
+
for (int j = 0; j < ctx->paths[i].len; j++)
|
680
|
+
{
|
681
|
+
switch (ctx->paths[i].elems[j].type)
|
682
|
+
{
|
683
|
+
case MATCHER_KEY:
|
684
|
+
rb_str_catf(res, "'%.*s'", (int)ctx->paths[i].elems[j].value.key.len, ctx->paths[i].elems[j].value.key.val);
|
685
|
+
break;
|
686
|
+
case MATCHER_INDEX:
|
687
|
+
rb_str_catf(res, "%ld", ctx->paths[i].elems[j].value.index);
|
688
|
+
break;
|
689
|
+
case MATCHER_INDEX_RANGE:
|
690
|
+
rb_str_catf(res, "(%ld..%ld)", ctx->paths[i].elems[j].value.range.start, ctx->paths[i].elems[j].value.range.end);
|
691
|
+
break;
|
692
|
+
case MATCHER_ANY_KEY:
|
693
|
+
rb_str_cat_cstr(res, "('*'..'*')");
|
694
|
+
break;
|
695
|
+
}
|
696
|
+
if (j < ctx->paths[i].len - 1)
|
697
|
+
rb_str_cat_cstr(res, ", ");
|
698
|
+
}
|
699
|
+
rb_str_cat_cstr(res, "]");
|
700
|
+
if (i < ctx->paths_len - 1)
|
701
|
+
rb_str_cat_cstr(res, ", ");
|
702
|
+
}
|
703
|
+
rb_str_cat_cstr(res, "]>");
|
704
|
+
return res;
|
705
|
+
}
|
706
|
+
|
474
707
|
static yajl_callbacks scan_callbacks = {
|
475
708
|
scan_on_null,
|
476
709
|
scan_on_boolean,
|
@@ -492,15 +725,16 @@ static yajl_callbacks scan_callbacks = {
|
|
492
725
|
VALUE scan(int argc, VALUE *argv, VALUE self)
|
493
726
|
{
|
494
727
|
VALUE json_str, path_ary, with_path_flag, kwargs;
|
495
|
-
VALUE kwargs_values[
|
728
|
+
VALUE kwargs_values[SCAN_KWARGS_SIZE];
|
496
729
|
|
497
|
-
int with_path = false, verbose_error = false;
|
730
|
+
int with_path = false, verbose_error = false, symbolize_path_keys = false;
|
498
731
|
char *json_text;
|
499
732
|
size_t json_text_len;
|
500
733
|
yajl_handle handle;
|
501
734
|
yajl_status stat;
|
502
735
|
scan_ctx *ctx;
|
503
|
-
|
736
|
+
int free_ctx = true;
|
737
|
+
VALUE err_msg = Qnil, bytes_consumed, result;
|
504
738
|
// Turned out callbacks can't raise exceptions
|
505
739
|
// VALUE callback_err;
|
506
740
|
#if RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7)
|
@@ -512,11 +746,13 @@ VALUE scan(int argc, VALUE *argv, VALUE self)
|
|
512
746
|
with_path = RTEST(with_path_flag);
|
513
747
|
if (kwargs != Qnil)
|
514
748
|
{
|
515
|
-
rb_get_kwargs(kwargs, scan_kwargs_table, 0,
|
749
|
+
rb_get_kwargs(kwargs, scan_kwargs_table, 0, SCAN_KWARGS_SIZE, kwargs_values);
|
516
750
|
if (kwargs_values[0] != Qundef)
|
517
751
|
with_path = RTEST(kwargs_values[0]);
|
518
752
|
if (kwargs_values[1] != Qundef)
|
519
753
|
verbose_error = RTEST(kwargs_values[1]);
|
754
|
+
if (kwargs_values[7] != Qundef)
|
755
|
+
symbolize_path_keys = RTEST(kwargs_values[7]);
|
520
756
|
}
|
521
757
|
rb_check_type(json_str, T_STRING);
|
522
758
|
json_text = RSTRING_PTR(json_str);
|
@@ -525,7 +761,30 @@ VALUE scan(int argc, VALUE *argv, VALUE self)
|
|
525
761
|
#else
|
526
762
|
json_text_len = RSTRING_LEN(json_str);
|
527
763
|
#endif
|
528
|
-
|
764
|
+
if (rb_obj_is_kind_of(path_ary, rb_cJsonScannerConfig))
|
765
|
+
{
|
766
|
+
free_ctx = false;
|
767
|
+
TypedData_Get_Struct(path_ary, scan_ctx, &config_type, ctx);
|
768
|
+
}
|
769
|
+
else
|
770
|
+
{
|
771
|
+
VALUE scan_ctx_init_err;
|
772
|
+
ctx = ruby_xmalloc(sizeof(scan_ctx));
|
773
|
+
scan_ctx_init_err = scan_ctx_init(ctx, path_ary, Qundef);
|
774
|
+
if (scan_ctx_init_err != Qundef)
|
775
|
+
{
|
776
|
+
ruby_xfree(ctx);
|
777
|
+
rb_exc_raise(scan_ctx_init_err);
|
778
|
+
}
|
779
|
+
}
|
780
|
+
// Need to keep a ref to result array on the stack to prevent it from being GC-ed
|
781
|
+
result = rb_ary_new_capa(ctx->paths_len);
|
782
|
+
for (int i = 0; i < ctx->paths_len; i++)
|
783
|
+
{
|
784
|
+
rb_ary_push(result, rb_ary_new());
|
785
|
+
}
|
786
|
+
scan_ctx_reset(ctx, result, with_path, symbolize_path_keys);
|
787
|
+
// scan_ctx_debug(ctx);
|
529
788
|
|
530
789
|
handle = yajl_alloc(&scan_callbacks, NULL, (void *)ctx);
|
531
790
|
if (kwargs != Qnil) // it's safe to read kwargs_values only if rb_get_kwargs was called
|
@@ -544,23 +803,52 @@ VALUE scan(int argc, VALUE *argv, VALUE self)
|
|
544
803
|
ctx->handle = handle;
|
545
804
|
stat = yajl_parse(handle, (unsigned char *)json_text, json_text_len);
|
546
805
|
if (stat == yajl_status_ok)
|
806
|
+
{
|
807
|
+
scan_ctx_save_bytes_consumed(ctx);
|
547
808
|
stat = yajl_complete_parse(handle);
|
809
|
+
}
|
548
810
|
|
549
811
|
if (stat != yajl_status_ok)
|
550
812
|
{
|
551
813
|
char *str = (char *)yajl_get_error(handle, verbose_error, (unsigned char *)json_text, json_text_len);
|
552
|
-
|
814
|
+
err_msg = rb_utf8_str_new_cstr(str);
|
815
|
+
bytes_consumed = ULL2NUM(scan_ctx_get_bytes_consumed(ctx));
|
553
816
|
yajl_free_error(handle, (unsigned char *)str);
|
554
817
|
}
|
818
|
+
// // Needed when yajl_allow_partial_values is set
|
819
|
+
// if (ctx->current_path_len > 0)
|
820
|
+
// {
|
821
|
+
// if (ctx->current_path_len > ctx->max_path_len)
|
822
|
+
// ctx->current_path_len = ctx->max_path_len;
|
823
|
+
// for (int i = ctx->current_path_len - 1; i > 0; i--)
|
824
|
+
// {
|
825
|
+
// switch (ctx->current_path[i].type)
|
826
|
+
// {
|
827
|
+
// case PATH_KEY:
|
828
|
+
// scan_on_end_object(ctx);
|
829
|
+
// break;
|
830
|
+
// case PATH_INDEX:
|
831
|
+
// scan_on_end_array(ctx);
|
832
|
+
// break;
|
833
|
+
// }
|
834
|
+
// }
|
835
|
+
// }
|
555
836
|
// callback_err = ctx->rb_err;
|
556
|
-
|
557
|
-
|
837
|
+
if (free_ctx)
|
838
|
+
{
|
839
|
+
// fprintf(stderr, "free_ctx\n");
|
840
|
+
scan_ctx_free(ctx);
|
841
|
+
ruby_xfree(ctx);
|
842
|
+
}
|
558
843
|
yajl_free(handle);
|
559
|
-
if (
|
560
|
-
|
844
|
+
if (err_msg != Qnil)
|
845
|
+
{
|
846
|
+
VALUE err = rb_exc_new_str(rb_eJsonScannerParseError, err_msg);
|
847
|
+
rb_ivar_set(err, rb_iv_bytes_consumed, bytes_consumed);
|
848
|
+
rb_exc_raise(err);
|
849
|
+
}
|
561
850
|
// if (callback_err != Qnil)
|
562
851
|
// rb_exc_raise(callback_err);
|
563
|
-
// TODO: report yajl_get_bytes_consumed(handle)
|
564
852
|
return result;
|
565
853
|
}
|
566
854
|
|
@@ -568,8 +856,16 @@ RUBY_FUNC_EXPORTED void
|
|
568
856
|
Init_json_scanner(void)
|
569
857
|
{
|
570
858
|
rb_mJsonScanner = rb_define_module("JsonScanner");
|
859
|
+
rb_cJsonScannerConfig = rb_define_class_under(rb_mJsonScanner, "Config", rb_cObject);
|
860
|
+
rb_define_alloc_func(rb_cJsonScannerConfig, config_alloc);
|
861
|
+
rb_define_method(rb_cJsonScannerConfig, "initialize", config_m_initialize, 1);
|
862
|
+
rb_define_method(rb_cJsonScannerConfig, "inspect", config_m_inspect, 0);
|
571
863
|
rb_define_const(rb_mJsonScanner, "ANY_INDEX", rb_range_new(INT2FIX(0), INT2FIX(-1), false));
|
864
|
+
any_key_sym = rb_id2sym(rb_intern("*"));
|
865
|
+
rb_define_const(rb_mJsonScanner, "ANY_KEY", rb_range_new(any_key_sym, any_key_sym, false));
|
572
866
|
rb_eJsonScannerParseError = rb_define_class_under(rb_mJsonScanner, "ParseError", rb_eRuntimeError);
|
867
|
+
rb_define_attr(rb_eJsonScannerParseError, BYTES_CONSUMED, true, false);
|
868
|
+
rb_iv_bytes_consumed = rb_intern("@" BYTES_CONSUMED);
|
573
869
|
rb_define_module_function(rb_mJsonScanner, "scan", scan, -1);
|
574
870
|
null_sym = rb_id2sym(rb_intern("null"));
|
575
871
|
boolean_sym = rb_id2sym(rb_intern("boolean"));
|
@@ -584,4 +880,5 @@ Init_json_scanner(void)
|
|
584
880
|
scan_kwargs_table[4] = rb_intern("allow_trailing_garbage");
|
585
881
|
scan_kwargs_table[5] = rb_intern("allow_multiple_values");
|
586
882
|
scan_kwargs_table[6] = rb_intern("allow_partial_values");
|
883
|
+
scan_kwargs_table[7] = rb_intern("symbolize_path_keys");
|
587
884
|
}
|
data/lib/json_scanner/version.rb
CHANGED
@@ -0,0 +1,128 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "rake/clean"
|
4
|
+
require "rake/extensiontask"
|
5
|
+
|
6
|
+
module Rake
  # Extension task that additionally compiles and runs C-level specs for a
  # native extension.
  #
  # Besides whatever the parent ExtensionTask defines, it registers:
  #
  # * compile:NAME:test  - copy the spec sources into "TMP_DIR/test" and build them
  # * spec:c:NAME[test]  - run one spec executable, or all of them when no name is given
  # * spec:valgrind:NAME[test] - run one spec executable under valgrind
  # * spec:gdb:NAME[test]      - open one spec executable in gdb
  # * spec:c             - aggregate task depending on spec:c:NAME
  class ExtensionTestTask < ExtensionTask
    #
    # The C files to compile.
    #
    attr_accessor :c_spec_files

    #
    # The folders where includes for the test files are.
    #
    # Default: %w{/usr/include /usr/include/google}
    #
    attr_accessor :test_includes

    #
    # The libraries to link against.
    #
    # Default: %w{cmockery}
    #
    attr_accessor :test_libraries

    #
    # The folders where the libraries are
    #
    # Default: %w{/usr/lib}
    #
    attr_accessor :test_lib_folders

    def initialize(*args, &block)
      super
      # The parent constructor may already have run a configuration block that
      # used the accessors above, so only fill in values that are still unset
      # instead of unconditionally overwriting them.
      @c_spec_files ||= []
      @test_includes ||= %w[/usr/include /usr/include/google]
      @test_libraries ||= %w[cmockery]
      @test_lib_folders ||= %w[/usr/lib]
      init_test_tasks(
        "#{@tmp_dir}/test", "compile:#{@name}:test",
        "spec:c:#{@name}", "spec:valgrind:#{@name}", "spec:gdb:#{@name}",
      )
    end

    private

    # Compiler "-I" flags: the configured include folders plus the build
    # folder, the extension sources and the system Ruby headers.
    def includes
      @includes ||= (@test_includes + [
        ".",
        "../../#{@ext_dir}",
        "/usr/include/ruby-#{RUBY_VERSION}",
        "/usr/include/ruby-#{RUBY_VERSION}/#{RUBY_PLATFORM}",
      ]).map { |l| "-I#{l}" }.join(" ")
    end

    # Linker "-l" flags: the configured libraries plus ruby, pthread and crypto.
    def libraries
      @libraries ||= (@test_libraries + %w[ruby pthread crypto]).map { |l| "-l#{l}" }.join(" ")
    end

    # Linker "-L" flags: the configured library folders plus /usr/lib and ".".
    def lib_folders
      @lib_folders ||= (@test_lib_folders + %w[/usr/lib .]).map { |l| "-L#{l}" }.join(" ")
    end

    # Compiles every C file in the current directory and links each resulting
    # object, together with the extension's own objects, into an executable
    # named after the object file without its extension.
    def compile_tests
      # compile the test sources
      FileList["*.c"].each do |cfile|
        sh "gcc -g #{includes} -c #{cfile}"
      end

      source_objects = FileList["../#{RUBY_PLATFORM}/#{@name}/#{RUBY_VERSION}/*.o"]
      # link the executables; libraries go after the objects that reference
      # them, otherwise linkers that resolve symbols left to right drop them
      FileList["*.o"].each do |ofile|
        sh "gcc -g #{lib_folders} #{source_objects} #{ofile} #{libraries} -o #{ofile.ext}"
      end
    end

    # Path of the spec executable selected through the :test task argument.
    # Raises ArgumentError when the argument was not given, because the
    # valgrind and gdb tasks cannot do anything useful without a target.
    def spec_executable(compile_dir, task_name, args)
      return "#{compile_dir}/#{args.test}" if args.test

      raise ArgumentError, "no test given, usage: rake \"#{task_name}[test_name]\""
    end

    # Defines the task that copies the spec sources into compile_dir and
    # builds them there, after the extension itself has been compiled.
    def init_compile_task(compile_dir, compile_task)
      directory compile_dir
      desc "Compile #{@name} tests"
      task compile_task => ["compile:#{@name}", compile_dir] do
        # copy the test files into the compilation folder
        @c_spec_files.each { |file| cp file, compile_dir }

        # start compilation
        chdir(compile_dir) { compile_tests }
      end
    end

    # Defines the task that runs a single spec executable under valgrind.
    def init_valgrind_task(compile_dir, compile_task, valgrind_task)
      desc "Execute valgrind for a #{@name} test"
      task valgrind_task, [:test] => [compile_task] do |_t, args|
        sh "valgrind --num-callers=50 --error-limit=no --partial-loads-ok=yes --undef-value-errors=no " \
           "--leak-check=full #{spec_executable(compile_dir, valgrind_task, args)}"
      end
    end

    # Defines the task that opens a single spec executable in gdb.
    def init_gdb_task(compile_dir, compile_task, gdb_task)
      desc "Execute gdb for a #{@name} test"
      task gdb_task, [:test] => [compile_task] do |_t, args|
        sh "gdb #{spec_executable(compile_dir, gdb_task, args)}"
      end
    end

    # Defines the task that runs one spec executable when the :test argument
    # is given, or every built spec executable otherwise.
    def init_test_task(compile_dir, compile_task, test_task)
      desc "Test #{@name}"
      task test_task, [:test] => [compile_task] do |_t, args|
        if args.test
          sh "#{compile_dir}/#{args.test}"
        else
          FileList["#{compile_dir}/*.o"].each do |ofile|
            sh ofile.ext.to_s
          end
        end
      end
    end

    # Registers all spec-related tasks and hooks the per-extension test task
    # into the aggregate "spec:c" task.
    def init_test_tasks(compile_dir, compile_task, test_task, valgrind_task, gdb_task)
      init_compile_task(compile_dir, compile_task)
      init_valgrind_task(compile_dir, compile_task, valgrind_task)
      init_gdb_task(compile_dir, compile_task, gdb_task)
      init_test_task(compile_dir, compile_task, test_task)

      desc "Test all C extensions"
      task "spec:c" => [test_task]
    end
  end
end
|
File without changes
|
data/spec/json_scanner_spec.rb
CHANGED
@@ -22,6 +22,34 @@ RSpec.describe JsonScanner do
|
|
22
22
|
)
|
23
23
|
end
|
24
24
|
|
25
|
+
it "supports 'symbolize_path_keys'" do
  json = '{"a": {"b": 1}}'
  value_position = [12, 13, :number]

  # Path keys come back as strings unless symbolization is requested,
  # regardless of how the selector itself spelled them.
  with_string_keys = described_class.scan(json, [[:a, "b"]], with_path: true)
  expect(with_string_keys).to eq([[[%w[a b], value_position]]])

  with_symbol_keys = described_class.scan(json, [[:a, "b"]], with_path: true, symbolize_path_keys: true)
  expect(with_symbol_keys).to eq([[[%i[a b], value_position]]])
end
|
33
|
+
|
34
|
+
it "supports any key selector" do
  # Any key inside any element of a top-level array; the nested array [5]
  # has no keys and therefore contributes nothing.
  keys_in_elements = described_class.scan(
    '[{"a":1,"b":2},{"c":3,"d":4},[5]]',
    [[described_class::ANY_INDEX, described_class::ANY_KEY]],
  )
  expect(keys_in_elements).to eq(
    [[[6, 7, :number], [12, 13, :number], [20, 21, :number], [26, 27, :number]]],
  )

  # Any index under any key of a top-level object; the nested object under
  # "b" has no indexes and therefore contributes nothing.
  elements_under_keys = described_class.scan(
    '{"a":[1,2],"b":{"c":3}}',
    [[described_class::ANY_KEY, described_class::ANY_INDEX]],
  )
  expect(elements_under_keys).to eq([[[6, 7, :number], [8, 9, :number]]])
end
|
52
|
+
|
25
53
|
it "works with max path len correctly" do
|
26
54
|
expect(
|
27
55
|
described_class.scan('{"a": [1]}', [[], ["a"]]),
|
@@ -90,11 +118,32 @@ RSpec.describe JsonScanner do
|
|
90
118
|
expect do
|
91
119
|
described_class.scan "{1}", [], verbose_error: true
|
92
120
|
end.to raise_error described_class::ParseError, /invalid object key(?=.*\(right here\))/m
|
121
|
+
expect do
|
122
|
+
described_class.scan("[0, 42,", [[(1..-1)]], verbose_error: true)
|
123
|
+
end.to raise_error described_class::ParseError, /parse error: premature EOF.*\[0, 42,.*\(right here\) ------\^/m
|
124
|
+
end
|
125
|
+
|
126
|
+
it "includes bytes consumed in the exception" do
  expect { described_class.scan("[[1,2],,[3,4]]", []) }.to(
    raise_error(described_class::ParseError) { |exc| expect(exc.bytes_consumed).to eq(8) },
  )

  # 6 because of the final " " chunk - that's how yajl works
  expect { described_class.scan("[[1,2", []) }.to(
    raise_error(described_class::ParseError) { |exc| expect(exc.bytes_consumed).to eq(6) },
  )
end
|
94
143
|
|
95
144
|
it "allows to return an actual path to the element" do
|
96
145
|
with_path_expected_res = [
|
97
|
-
# result for first
|
146
|
+
# result for first matcher, each element array of two items:
|
98
147
|
# array of path elements and 3-element array start,end,type
|
99
148
|
[[[0], [1, 6, :array]], [[1], [7, 12, :array]]],
|
100
149
|
[
|
@@ -128,7 +177,7 @@ RSpec.describe JsonScanner do
|
|
128
177
|
),
|
129
178
|
).to eq(
|
130
179
|
[
|
131
|
-
# result for first
|
180
|
+
# result for first matcher, each element 3-element array start,end,type
|
132
181
|
[[1, 6, :array], [7, 12, :array]],
|
133
182
|
[
|
134
183
|
[2, 3, :number], [4, 5, :number],
|
@@ -190,4 +239,122 @@ RSpec.describe JsonScanner do
|
|
190
239
|
described_class.scan(json, [[]])
|
191
240
|
end.to raise_error(described_class::ParseError)
|
192
241
|
end
|
242
|
+
|
243
|
+
context "with yajl params" do
  # Each example checks that the input is accepted with the option enabled
  # and rejected with a ParseError without it.

  it "supports 'allow_comments'" do
    json = "[0, /* answer */ 42, 0]"
    selectors = [[(1..-1)]]

    expect(described_class.scan(json, selectors, allow_comments: true)).to eq(
      [[[17, 19, :number], [21, 22, :number]]],
    )
    expect { described_class.scan(json, selectors) }.to raise_error(described_class::ParseError)
  end

  it "supports 'dont_validate_strings'" do
    # Invalid UTF-8 as a top-level string value
    json = "\"\x81\x83\""
    selectors = [[]]
    expect(described_class.scan(json, selectors, dont_validate_strings: true)).to eq(
      [[[0, 4, :string]]],
    )
    expect { described_class.scan(json, selectors) }.to raise_error(described_class::ParseError)

    # Invalid UTF-8 as an object key, reported back in the path
    json = "{\"\x81\x83\": 42}"
    selectors = [[JsonScanner::ANY_KEY]]
    binary_key = "\x81\x83".dup.force_encoding(Encoding::BINARY)
    expect(described_class.scan(json, selectors, dont_validate_strings: true, with_path: true)).to eq(
      [[[[binary_key], [7, 9, :number]]]],
    )
    expect { described_class.scan(json, selectors, with_path: true) }.to raise_error(described_class::ParseError)
  end

  it "supports 'allow_trailing_garbage'" do
    json = "[0, 42, 0]garbage"
    selectors = [[(1..-1)]]

    expect(described_class.scan(json, selectors, allow_trailing_garbage: true)).to eq(
      [[[4, 6, :number], [8, 9, :number]]],
    )
    expect { described_class.scan(json, selectors) }.to raise_error(described_class::ParseError)
  end

  it "supports 'allow_multiple_values'" do
    json = "[0, 42, 0] [0, 34]"
    selectors = [[(1..-1)]]

    expect(described_class.scan(json, selectors, allow_multiple_values: true)).to eq(
      [[[4, 6, :number], [8, 9, :number], [16, 18, :number]]],
    )
    expect { described_class.scan(json, selectors) }.to raise_error(described_class::ParseError)
  end

  it "handles multiple top-level values correctly with 'allow_multiple_values'" do
    array_then_array = described_class.scan("[0, 42, 0] [0, 34]", [[]], allow_multiple_values: true)
    expect(array_then_array).to eq([[[0, 10, :array], [12, 19, :array]]])

    object_then_array = described_class.scan('{"42": 34} [0, 34]', [[]], allow_multiple_values: true)
    expect(object_then_array).to eq([[[0, 10, :object], [12, 19, :array]]])

    array_then_object = described_class.scan('[0, 42, 0] {"42": 34}', [[]], allow_multiple_values: true)
    expect(array_then_object).to eq([[[0, 10, :array], [12, 22, :object]]])

    object_then_object = described_class.scan('{"42": 34} {"0": 34}', [[]], allow_multiple_values: true)
    expect(object_then_object).to eq([[[0, 10, :object], [12, 21, :object]]])
  end

  it "supports 'allow_partial_values'" do
    json = "[0, 42, 0,"
    selectors = [[(1..-1)]]

    expect(described_class.scan(json, selectors, allow_partial_values: true)).to eq(
      [[[4, 6, :number], [8, 9, :number]]],
    )
    expect { described_class.scan(json, selectors) }.to raise_error(described_class::ParseError)

    without_trailing_comma = described_class.scan("[0, 42, 0", [[(1..-1)]], allow_partial_values: true)
    expect(without_trailing_comma).to eq([[[4, 6, :number], [8, 9, :number]]])

    with_boolean_tail = described_class.scan("[0, 42, true", [[(1..-1)]], allow_partial_values: true)
    expect(with_boolean_tail).to eq([[[4, 6, :number], [8, 12, :boolean]]])
  end
end
|
322
|
+
|
323
|
+
describe described_class::Config do
  it "saves state" do
    json = '{"abracadabra": 10}'
    key = "abracadabra".dup
    conf = described_class.new [[], [key]]
    # Mutate and drop the original key, then force a GC run: the config is
    # expected to keep matching the key it was created with.
    key["cad"] = 0.chr
    key = nil # rubocop:disable Lint/UselessAssignment
    GC.start

    results_with_path = Array.new(10) { JsonScanner.scan(json, conf, with_path: true) }
    expect(results_with_path.uniq).to eq(
      [[[[[], [0, 19, :object]]], [[["abracadabra"], [16, 18, :number]]]]],
    )

    results_without_path = Array.new(10) { JsonScanner.scan(json, conf) }
    expect(results_without_path.uniq).to eq([[[[0, 19, :object]], [[16, 18, :number]]]])
  end

  it "re-raises exceptions" do
    expect { described_class.new [[(0...-1)]] }.to raise_error ArgumentError
    expect { described_class.new [[(0..-2)]] }.to raise_error ArgumentError
    expect { described_class.new [[(-42..1)]] }.to raise_error ArgumentError
  end

  it "supports inspect" do
    config = described_class.new([[], ["abracadabra", JsonScanner::ANY_INDEX], [42, JsonScanner::ANY_KEY]])
    expect(config.inspect).to eq(
      "#<JsonScanner::Config [[], ['abracadabra', (0..9223372036854775807)], [42, ('*'..'*')]]>",
    )
  end
end
|
193
360
|
end
|
metadata
CHANGED
@@ -1,16 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: json_scanner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- uvlad7
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
|
-
description: This gem uses yajl lib to scan a
|
13
|
+
description: This gem uses the yajl lib to scan a JSON string and allows you to parse
|
14
14
|
pieces of it
|
15
15
|
email:
|
16
16
|
- uvlad7@gmail.com
|
@@ -26,6 +26,8 @@ files:
|
|
26
26
|
- lib/json_scanner.rb
|
27
27
|
- lib/json_scanner/version.rb
|
28
28
|
- sig/json_scanner.rbs
|
29
|
+
- spec/extensiontesttask.rb
|
30
|
+
- spec/json_scanner_spec.c
|
29
31
|
- spec/json_scanner_spec.rb
|
30
32
|
- spec/spec_helper.rb
|
31
33
|
homepage: https://github.com/uvlad7/json_scanner
|