smarter_json 0.8.0 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +83 -47
- data/README.md +225 -46
- data/docs/_introduction.md +6 -12
- data/docs/basic_read_api.md +59 -16
- data/docs/basic_write_api.md +2 -2
- data/docs/examples.md +58 -24
- data/docs/options.md +14 -14
- data/ext/smarter_json/smarter_json.c +261 -97
- data/ext/smarter_json/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_json/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_json/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_json/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_json/backports.rb +28 -0
- data/lib/smarter_json/options.rb +52 -0
- data/lib/smarter_json/parser.rb +722 -198
- data/lib/smarter_json/version.rb +1 -1
- data/lib/smarter_json.rb +3 -1
- metadata +9 -5
- data/ext/smarter_json/vendor/ryu.h +0 -819
- data/ext/smarter_json/vendor/ryu.md +0 -22
data/lib/smarter_json/parser.rb
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
|
|
4
|
+
# activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
|
|
5
|
+
using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
|
|
6
|
+
|
|
3
7
|
module SmarterJSON
|
|
4
8
|
# ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
|
|
5
9
|
# from the shared SmarterJSON::Error base.
|
|
@@ -12,15 +16,20 @@ module SmarterJSON
|
|
|
12
16
|
# is always content, never a filename — use process_file for paths.) The values
|
|
13
17
|
# in `options` override Parser::DEFAULT_OPTIONS.
|
|
14
18
|
#
|
|
15
|
-
# Without a block: returns
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
+
# Without a block: always returns an Array of the documents found — [] for none,
|
|
20
|
+
# [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
|
|
21
|
+
# top-level value must be a recognized JSON value (number / literal / quoted
|
|
22
|
+
# string / object / array) or an implicit-root object, else it raises. For the
|
|
23
|
+
# single-document case use SmarterJSON.process_one (returns the bare value).
|
|
24
|
+
# :acceleration (default true) selects the C extension when compiled and loaded
|
|
25
|
+
# (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
|
|
19
26
|
#
|
|
20
|
-
# With a block: yields each top-level document as it is parsed, and returns
|
|
21
|
-
# For an IO this streams document-by-document in bounded memory —
|
|
22
|
-
# stream as newline-delimited documents (NDJSON / JSONL), one per
|
|
27
|
+
# With a block: yields each top-level document as it is parsed, and returns the
|
|
28
|
+
# document count. For an IO this streams document-by-document in bounded memory —
|
|
29
|
+
# it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
|
|
30
|
+
# line.
|
|
23
31
|
def process(input, options = {}, &block)
|
|
32
|
+
options = Options.process_options(options)
|
|
24
33
|
if input.is_a?(String)
|
|
25
34
|
Recovery.process_string(input, options, &block)
|
|
26
35
|
elsif input.respond_to?(:read)
|
|
@@ -39,7 +48,8 @@ module SmarterJSON
|
|
|
39
48
|
# loading the whole file); the documents are read as newline-delimited
|
|
40
49
|
# (NDJSON / JSONL), one per line.
|
|
41
50
|
def process_file(path, options = {}, &block)
|
|
42
|
-
|
|
51
|
+
options = Options.process_options(options)
|
|
52
|
+
encoding = options[:encoding] || "UTF-8"
|
|
43
53
|
if block
|
|
44
54
|
File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
|
|
45
55
|
else
|
|
@@ -47,8 +57,44 @@ module SmarterJSON
|
|
|
47
57
|
end
|
|
48
58
|
end
|
|
49
59
|
|
|
50
|
-
#
|
|
51
|
-
#
|
|
60
|
+
# SmarterJSON.process_one(input, options = {}) — the single-document accessor.
|
|
61
|
+
#
|
|
62
|
+
# Returns the first document's value (or nil when the input holds no documents).
|
|
63
|
+
# When the input holds MORE than one document it returns the first and warns once
|
|
64
|
+
# — it never raises, since an extra document is valid data; the warning goes to
|
|
65
|
+
# on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
|
|
66
|
+
# For an IO this is bounded memory: it parses just the first document and stops as
|
|
67
|
+
# soon as a second is seen, instead of materialising the whole stream the way
|
|
68
|
+
# process(io).first would. (process(input).first and process(input)[0] silently
|
|
69
|
+
# drop documents 2+ — a footgun; use process_one instead.)
|
|
70
|
+
def process_one(input, options = {})
  options = Options.process_options(options)

  if input.respond_to?(:read)
    # IO: stream, keep only the first document, and bail out of the stream as
    # soon as a second one shows up (it is only needed to trigger the warning).
    seen = 0
    head = nil
    catch(:smarter_json_first_document) do
      process(input, options) do |document|
        seen += 1
        throw(:smarter_json_first_document) unless seen == 1

        head = document
      end
    end
    warn_extra_documents(options) if seen > 1
    head
  else
    # Non-IO input is already in memory: take the no-block Array result in one
    # pass and return its first element (nil when there are no documents).
    documents = process(input, options)
    warn_extra_documents(options) if documents.length > 1
    documents.first
  end
end
|
|
95
|
+
|
|
96
|
+
# Parse a String of JSON content (the in-memory path). Returns an Array of the
|
|
97
|
+
# documents found (empty for none); the C extension is used when available.
|
|
52
98
|
def process_content(input, options, &block)
|
|
53
99
|
if block
|
|
54
100
|
if options.fetch(:acceleration, true) && HAS_ACCELERATION
|
|
@@ -63,22 +109,322 @@ module SmarterJSON
|
|
|
63
109
|
end
|
|
64
110
|
end
|
|
65
111
|
|
|
66
|
-
# Stream documents from an IO,
|
|
67
|
-
#
|
|
68
|
-
# spanning multiple lines is not supported by the streaming path.
|
|
112
|
+
# Stream documents from an IO incrementally, yielding each recovered top-level
|
|
113
|
+
# document without slurping the whole input into memory first.
|
|
69
114
|
def stream_io(io, options, &block)
  total = 0
  Framer.each_document(io) do |segment|
    # process_string returns the number of values it yielded for this framed
    # segment, so the running total counts documents rather than segments.
    total += Recovery.process_string(segment, options, &block)
  end
  total
end
|
|
72
124
|
|
|
73
|
-
|
|
125
|
+
# process_one's "more than one document" notice — routed to on_warning if the caller
|
|
126
|
+
# gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
|
|
127
|
+
# never raised.
|
|
128
|
+
def warn_extra_documents(options)
  message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
            "dropping the rest. Use SmarterJSON.process to get every document."

  # 1. A caller-supplied handler wins.
  handler = options[:on_warning]
  return handler.call(Warning.new(:extra_documents, message, nil, nil)) if handler

  # 2. Otherwise the Rails logger, when Rails is loaded and has one.
  return Rails.logger.warn(message) if defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger

  # 3. Last resort: Kernel#warn.
  Kernel.warn(message)
end
|
|
140
|
+
|
|
141
|
+
private_class_method :process_content, :stream_io, :warn_extra_documents
|
|
142
|
+
|
|
143
|
+
# Named byte values, shared by the Parser FSM and the Framer / Recovery byte
|
|
144
|
+
# scanners so none of them spell out raw hex. Included where needed.
|
|
145
|
+
module Bytes
  LBRACE = 0x7B
  RBRACE = 0x7D
  LBRACKET = 0x5B
  RBRACKET = 0x5D
  COLON = 0x3A
  COMMA = 0x2C
  DQUOTE = 0x22
  SQUOTE = 0x27
  BACKSLASH = 0x5C
  SLASH = 0x2F
  STAR = 0x2A
  HASH = 0x23
  MINUS = 0x2D
  PLUS = 0x2B
  DOT = 0x2E
  ZERO = 0x30
  NINE = 0x39
  LOWER_E = 0x65
  UPPER_E = 0x45
  LOWER_T = 0x74
  LOWER_F = 0x66
  LOWER_N = 0x6E
  LOWER_U = 0x75
  LOWER_X = 0x78
  UPPER_X = 0x58
  UPPER_I = 0x49
  UPPER_N = 0x4E
  UPPER_T = 0x54
  UPPER_F = 0x46
  UNDERSCORE = 0x5F
  DOLLAR = 0x24
  SPACE = 0x20
  TAB = 0x09
  LF = 0x0A
  CR = 0x0D
end

# Incremental framer: reads an IO in chunks and yields the raw text of each
# top-level `{…}` / `[…]` document as soon as its closing bracket is seen,
# tracking quotes and comments so brackets inside them are ignored. Anything
# left in the buffer at end of input is yielded as one final segment unless it
# is only whitespace / comments.
module Framer
  include Bytes

  # Maximum number of bytes requested from the IO per read.
  CHUNK_SIZE = 16 * 1024

  module_function

  # Yields each framed segment (a String) read from `io`.
  #
  # The scanner state (scan offset, start of the open document, bracket stack,
  # quote/comment mode) is carried across chunks, so a document may span any
  # number of reads.
  def each_document(io)
    buffer = +""
    scan = 0
    doc_start = nil
    stack = []
    mode = nil

    while (chunk = read_chunk(io))
      buffer << chunk
      loop do
        emitted, buffer, scan, doc_start, stack, mode = scan_buffer(buffer, scan, doc_start, stack, mode)
        break unless emitted

        yield emitted
      end
    end

    yield buffer unless separators_only?(buffer)
  end

  # Reads up to CHUNK_SIZE bytes, preferring readpartial when the IO has it.
  # Returns nil at end of input (EOFError from readpartial, or nil from read).
  def read_chunk(io)
    if io.respond_to?(:readpartial)
      io.readpartial(CHUNK_SIZE)
    else
      io.read(CHUNK_SIZE)
    end
  rescue EOFError
    nil
  end

  # Advances the scanner over `buffer` starting at byte offset `scan`.
  #
  # Returns a 6-element Array [emitted, buffer, scan, doc_start, stack, mode]:
  # when a top-level document closes, `emitted` is its text, `buffer` is the
  # unread remainder and the state is reset; otherwise `emitted` is nil and the
  # remaining elements are the state to resume from once more input arrives.
  def scan_buffer(buffer, scan, doc_start, stack, mode)
    while scan < buffer.bytesize
      b = buffer.getbyte(scan)
      # A multi-byte marker (// /* ''' */) whose lead byte is here but whose
      # remaining bytes have not arrived yet must not be guessed at. Stop and
      # let each_document append more input, then resume from this position.
      break if defer_for_split_marker?(buffer, scan, b, mode, doc_start)

      if mode == :double || mode == :single
        closer = mode == :double ? DQUOTE : SQUOTE
        if b == BACKSLASH
          scan += 2 # skip the escaped byte too
        else
          mode = nil if b == closer
          scan += 1
        end
      elsif mode == :triple
        if buffer.byteslice(scan, 3) == "'''"
          mode = nil
          scan += 3
        else
          scan += 1
        end
      elsif mode == :line_comment
        if [LF, CR].include?(b)
          mode = nil # the newline itself is re-examined on the next iteration
        else
          scan += 1
        end
      elsif mode == :block_comment
        if buffer.byteslice(scan, 2) == '*/'
          mode = nil
          scan += 2
        else
          scan += 1
        end
      elsif doc_start.nil?
        # Between documents: skip separators, open a document on { or [.
        if whitespace_byte?(b)
          scan += 1
        elsif line_comment_start?(buffer, scan)
          mode = :line_comment
          scan += buffer.getbyte(scan) == HASH ? 1 : 2
        elsif block_comment_start?(buffer, scan)
          mode = :block_comment
          scan += 2
        elsif [LBRACE, LBRACKET].include?(b)
          doc_start = scan
          stack << b
          scan += 1
        else
          # Not a container start: stop framing what is buffered so far. The
          # skipped bytes stay in the buffer (see emit_document).
          scan = buffer.bytesize
        end
      elsif mode.nil? && line_comment_start?(buffer, scan)
        mode = :line_comment
        scan += buffer.getbyte(scan) == HASH ? 1 : 2
      elsif mode.nil? && block_comment_start?(buffer, scan)
        mode = :block_comment
        scan += 2
      elsif b == DQUOTE
        mode = :double
        scan += 1
      elsif buffer.byteslice(scan, 3) == "'''"
        mode = :triple
        scan += 3
      elsif b == SQUOTE
        mode = :single
        scan += 1
      elsif [LBRACE, LBRACKET].include?(b)
        stack << b
        scan += 1
      elsif b == RBRACE || b == RBRACKET
        opener = b == RBRACE ? LBRACE : LBRACKET
        stack.pop if stack.last == opener # a mismatched closer is ignored
        scan += 1
        return emit_document(buffer, doc_start, scan) if stack.empty?
      else
        scan += 1
      end
    end

    [nil, buffer, scan, doc_start, stack, mode]
  end

  # Builds scan_buffer's "document closed" result for the document spanning
  # doc_start...scan.
  #
  # If the text before doc_start is only whitespace / comments it is dropped.
  # Otherwise (content that scan_buffer skipped in an earlier chunk, e.g. a
  # scalar line) it is kept as part of the emitted segment, so no input is
  # lost and the result does not depend on where the chunk boundaries fell.
  def emit_document(buffer, doc_start, scan)
    from = separators_only?(buffer.byteslice(0, doc_start)) ? doc_start : 0
    doc = buffer.byteslice(from, scan - from)
    rest = buffer.byteslice(scan..-1) || +""
    [doc, rest, 0, nil, [], nil]
  end

  # True when `b` is the lead byte of a multi-byte marker but the rest of that
  # marker has not been read into the buffer yet, so we cannot decide what it is.
  # `//` and `/*` need 2 bytes; `'''` (and a closing `'''`) needs 3; a closing
  # `*/` needs 2. Backslash escapes and single-byte delimiters never need this.
  def defer_for_split_marker?(buffer, scan, b, mode, doc_start)
    avail = buffer.bytesize - scan
    case mode
    when :block_comment
      b == STAR && avail < 2
    when :triple
      b == SQUOTE && avail < 3
    when nil
      if doc_start.nil?
        b == SLASH && avail < 2
      else
        (b == SLASH && avail < 2) || (b == SQUOTE && avail < 3)
      end
    else
      false
    end
  end

  # True when `buffer` holds nothing but whitespace and comments (an empty
  # buffer counts as separators only).
  def separators_only?(buffer)
    scan = 0
    mode = nil
    while scan < buffer.bytesize
      b = buffer.getbyte(scan)
      if mode == :line_comment
        if [LF, CR].include?(b)
          mode = nil
        else
          scan += 1
          next
        end
      elsif mode == :block_comment
        if buffer.byteslice(scan, 2) == '*/'
          mode = nil
          scan += 2
        else
          scan += 1
        end
      elsif whitespace_byte?(b)
        scan += 1
      elsif line_comment_start?(buffer, scan)
        mode = :line_comment
        scan += buffer.getbyte(scan) == HASH ? 1 : 2
      elsif block_comment_start?(buffer, scan)
        mode = :block_comment
        scan += 2
      else
        return false
      end
    end
    true
  end

  # Space, or any byte in TAB..CR (tab, LF, VT, FF, CR). nil is not whitespace.
  def whitespace_byte?(b)
    b == SPACE || (b && b >= TAB && b <= CR)
  end

  # `#` or `//` at `scan`, but only at buffer start or right after whitespace.
  def line_comment_start?(buffer, scan)
    b = buffer.getbyte(scan)
    return preceded_by_ws_or_start?(buffer, scan) if b == HASH

    b == SLASH && buffer.getbyte(scan + 1) == SLASH && preceded_by_ws_or_start?(buffer, scan)
  end

  # `/*` at `scan`, but only at buffer start or right after whitespace.
  def block_comment_start?(buffer, scan)
    buffer.getbyte(scan) == SLASH && buffer.getbyte(scan + 1) == STAR && preceded_by_ws_or_start?(buffer, scan)
  end

  # True at offset 0 or when the previous byte is whitespace.
  def preceded_by_ws_or_start?(buffer, scan)
    return true if scan.zero?

    prev = buffer.getbyte(scan - 1)
    whitespace_byte?(prev)
  end
end
|
|
74
407
|
|
|
75
408
|
module Recovery
|
|
409
|
+
include Bytes
|
|
410
|
+
|
|
76
411
|
module_function
|
|
77
412
|
|
|
78
413
|
def process_string(input, options, &block)
|
|
79
414
|
return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
|
|
80
415
|
|
|
81
|
-
|
|
416
|
+
# Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
|
|
417
|
+
# the parse actually fails (the rescue below). Every wrapper shape — code fences,
|
|
418
|
+
# <json>/BEGIN_JSON tags, prose around the payload — makes the parse raise, so the
|
|
419
|
+
# rescue catches it. Crucially this keeps clean input on the single-parse fast path
|
|
420
|
+
# even when its string values legitimately contain ``` or <json> (real-world data
|
|
421
|
+
# like GitHub event payloads is full of markdown), instead of dragging hundreds of
|
|
422
|
+
# MB through the pure-Ruby candidate scan.
|
|
423
|
+
#
|
|
424
|
+
# The one exception is a bare leading label like "JSON: {...}", which parses
|
|
425
|
+
# successfully but WRONGLY (as an implicit-root object keyed by the label), so it
|
|
426
|
+
# must be intercepted before parsing.
|
|
427
|
+
if leading_label?(input)
|
|
82
428
|
payloads = extract_payloads(input, options)
|
|
83
429
|
return replay_payloads(payloads, options, &block) unless payloads.empty?
|
|
84
430
|
end
|
|
@@ -93,25 +439,37 @@ module SmarterJSON
|
|
|
93
439
|
raise
|
|
94
440
|
end
|
|
95
441
|
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
442
|
+
# Whether the input opens with a bare "JSON:" / "Final answer:" label (which would
|
|
443
|
+
# otherwise parse, wrongly, as an implicit-root object keyed by the label). We use
|
|
444
|
+
# String#start_with? with a Regexp rather than match?(/\A.../): start_with? checks
|
|
445
|
+
# only the beginning, whereas a \A-anchored match? still retries at every byte
|
|
446
|
+
# position and so scans the WHOLE input (≈0.3s on a 200 MB document) on every parse.
|
|
447
|
+
# (Caller has already established the input is valid_encoding?.)
|
|
448
|
+
def leading_label?(input)
  # Anchored by start_with? itself, so only the head of the input is examined.
  label_prefix = /[[:space:]]*(?:JSON|Final answer)[[:space:]]*:/i
  input.start_with?(label_prefix)
end
|
|
101
451
|
|
|
102
452
|
def replay_payloads(payloads, options, &block)
  emit_wrapper_warnings(payloads, options[:on_warning])

  unless block
    # No block: each payload parses to an Array of documents; flatten them so
    # the caller gets one flat Array rather than an Array of Arrays.
    return payloads.flat_map do |payload|
      SmarterJSON.send(:process_content, payload[:slice], options)
    end
  end

  # Block form: forward every document to the caller and report how many.
  yielded = 0
  payloads.each do |payload|
    SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
      block.call(doc)
      yielded += 1
    end
  end
  yielded
end
|
|
116
474
|
|
|
117
475
|
def emit_wrapper_warnings(payloads, handler)
|
|
@@ -146,16 +504,31 @@ module SmarterJSON
|
|
|
146
504
|
last = ranges.last
|
|
147
505
|
prefix = input.byteslice(0, first.begin)
|
|
148
506
|
suffix = input.byteslice(last.end, input.bytesize - last.end)
|
|
507
|
+
# Look for fence / wrapper markers only in the text we actually strip (outside
|
|
508
|
+
# every recovered payload), so a ``` or <json> sitting inside a payload's own
|
|
509
|
+
# string value does not trigger a "stripped a wrapper" warning.
|
|
510
|
+
outside = non_payload_text(input, ranges)
|
|
149
511
|
{
|
|
150
512
|
prefix: substantive_text?(prefix),
|
|
151
513
|
suffix: substantive_text?(suffix),
|
|
152
|
-
fence:
|
|
153
|
-
wrapper:
|
|
514
|
+
fence: outside.include?("```"),
|
|
515
|
+
wrapper: outside.match?(/<json\b|BEGIN_JSON\b/i),
|
|
154
516
|
first_pos: line_col_for(input, first.begin),
|
|
155
517
|
last_pos: line_col_for(input, last.begin)
|
|
156
518
|
}
|
|
157
519
|
end
|
|
158
520
|
|
|
521
|
+
# Returns the text of `input` that lies outside every byte range in `ranges`
# (the recovered payloads), i.e. the text that gets stripped.
#
# Consecutive gaps are joined with a newline rather than glued together: a
# removed payload is a non-word boundary in the original text, so markers
# directly adjacent to it (`BEGIN_JSON{…}END_JSON`) keep their `\b` boundary,
# and fragments on either side of a payload cannot fuse into a marker that was
# never there (`` + payload + ` must not read as a ``` fence).
#
# `ranges` are assumed to be ascending, non-overlapping byte ranges whose
# `end` is the first byte after the payload — TODO confirm against the caller.
def non_payload_text(input, ranges)
  out = +""
  pos = 0
  append_gap = lambda do |from, to|
    next unless to > from

    out << "\n" unless out.empty?
    out << input.byteslice(from, to - from)
  end

  ranges.each do |range|
    append_gap.call(pos, range.begin)
    pos = range.end
  end
  append_gap.call(pos, input.bytesize)
  out
end
|
|
531
|
+
|
|
159
532
|
def line_col_for(input, offset)
|
|
160
533
|
line = 1
|
|
161
534
|
col = 1
|
|
@@ -164,15 +537,15 @@ module SmarterJSON
|
|
|
164
537
|
b = input.getbyte(i)
|
|
165
538
|
break if b.nil?
|
|
166
539
|
|
|
167
|
-
if b ==
|
|
540
|
+
if b == LF
|
|
168
541
|
line += 1
|
|
169
542
|
col = 1
|
|
170
543
|
i += 1
|
|
171
|
-
elsif b ==
|
|
544
|
+
elsif b == CR
|
|
172
545
|
line += 1
|
|
173
546
|
col = 1
|
|
174
547
|
i += 1
|
|
175
|
-
i += 1 if i < offset && input.getbyte(i) ==
|
|
548
|
+
i += 1 if i < offset && input.getbyte(i) == LF
|
|
176
549
|
else
|
|
177
550
|
col += 1
|
|
178
551
|
i += 1
|
|
@@ -203,19 +576,19 @@ module SmarterJSON
|
|
|
203
576
|
while i < input.bytesize
|
|
204
577
|
b = input.getbyte(i)
|
|
205
578
|
if mode == :double
|
|
206
|
-
if b ==
|
|
579
|
+
if b == BACKSLASH
|
|
207
580
|
i += 2
|
|
208
581
|
next
|
|
209
|
-
elsif b ==
|
|
582
|
+
elsif b == DQUOTE
|
|
210
583
|
mode = nil
|
|
211
584
|
end
|
|
212
585
|
i += 1
|
|
213
586
|
next
|
|
214
587
|
elsif mode == :single
|
|
215
|
-
if b ==
|
|
588
|
+
if b == BACKSLASH
|
|
216
589
|
i += 2
|
|
217
590
|
next
|
|
218
|
-
elsif b ==
|
|
591
|
+
elsif b == SQUOTE
|
|
219
592
|
mode = nil
|
|
220
593
|
end
|
|
221
594
|
i += 1
|
|
@@ -229,7 +602,7 @@ module SmarterJSON
|
|
|
229
602
|
end
|
|
230
603
|
next
|
|
231
604
|
elsif mode == :line_comment
|
|
232
|
-
if [
|
|
605
|
+
if [LF, CR].include?(b)
|
|
233
606
|
mode = nil
|
|
234
607
|
else
|
|
235
608
|
i += 1
|
|
@@ -252,11 +625,11 @@ module SmarterJSON
|
|
|
252
625
|
mode = :block_comment
|
|
253
626
|
i += 2
|
|
254
627
|
next
|
|
255
|
-
elsif b ==
|
|
628
|
+
elsif b == HASH
|
|
256
629
|
mode = :line_comment
|
|
257
630
|
i += 1
|
|
258
631
|
next
|
|
259
|
-
elsif b ==
|
|
632
|
+
elsif b == DQUOTE
|
|
260
633
|
mode = :double
|
|
261
634
|
i += 1
|
|
262
635
|
next
|
|
@@ -264,21 +637,21 @@ module SmarterJSON
|
|
|
264
637
|
mode = :triple
|
|
265
638
|
i += 3
|
|
266
639
|
next
|
|
267
|
-
elsif b ==
|
|
640
|
+
elsif b == SQUOTE
|
|
268
641
|
mode = :single
|
|
269
642
|
i += 1
|
|
270
643
|
next
|
|
271
|
-
elsif [
|
|
644
|
+
elsif [LBRACE, LBRACKET].include?(b)
|
|
272
645
|
start_pos = i if stack.empty?
|
|
273
646
|
stack << b
|
|
274
|
-
elsif b ==
|
|
275
|
-
stack.pop if stack.last ==
|
|
647
|
+
elsif b == RBRACE
|
|
648
|
+
stack.pop if stack.last == LBRACE
|
|
276
649
|
if stack.empty? && start_pos
|
|
277
650
|
ranges << (start_pos...(i + 1))
|
|
278
651
|
start_pos = nil
|
|
279
652
|
end
|
|
280
|
-
elsif b ==
|
|
281
|
-
stack.pop if stack.last ==
|
|
653
|
+
elsif b == RBRACKET
|
|
654
|
+
stack.pop if stack.last == LBRACKET
|
|
282
655
|
if stack.empty? && start_pos
|
|
283
656
|
ranges << (start_pos...(i + 1))
|
|
284
657
|
start_pos = nil
|
|
@@ -304,41 +677,7 @@ module SmarterJSON
|
|
|
304
677
|
# Python literals (True/False/None) and undefined, underscores in
|
|
305
678
|
# numeric literals, and encoding validation (SmarterJSON::EncodingError).
|
|
306
679
|
class Parser
|
|
307
|
-
|
|
308
|
-
RBRACE = 0x7D
|
|
309
|
-
LBRACKET = 0x5B
|
|
310
|
-
RBRACKET = 0x5D
|
|
311
|
-
COLON = 0x3A
|
|
312
|
-
COMMA = 0x2C
|
|
313
|
-
DQUOTE = 0x22
|
|
314
|
-
SQUOTE = 0x27
|
|
315
|
-
BACKSLASH = 0x5C
|
|
316
|
-
SLASH = 0x2F
|
|
317
|
-
STAR = 0x2A
|
|
318
|
-
HASH = 0x23
|
|
319
|
-
MINUS = 0x2D
|
|
320
|
-
PLUS = 0x2B
|
|
321
|
-
DOT = 0x2E
|
|
322
|
-
ZERO = 0x30
|
|
323
|
-
NINE = 0x39
|
|
324
|
-
LOWER_E = 0x65
|
|
325
|
-
UPPER_E = 0x45
|
|
326
|
-
LOWER_T = 0x74
|
|
327
|
-
LOWER_F = 0x66
|
|
328
|
-
LOWER_N = 0x6E
|
|
329
|
-
LOWER_U = 0x75
|
|
330
|
-
LOWER_X = 0x78
|
|
331
|
-
UPPER_X = 0x58
|
|
332
|
-
UPPER_I = 0x49
|
|
333
|
-
UPPER_N = 0x4E
|
|
334
|
-
UPPER_T = 0x54
|
|
335
|
-
UPPER_F = 0x46
|
|
336
|
-
UNDERSCORE = 0x5F
|
|
337
|
-
DOLLAR = 0x24
|
|
338
|
-
SPACE = 0x20
|
|
339
|
-
TAB = 0x09
|
|
340
|
-
LF = 0x0A
|
|
341
|
-
CR = 0x0D
|
|
680
|
+
include Bytes
|
|
342
681
|
|
|
343
682
|
NOT_NUMERIC = Object.new
|
|
344
683
|
HEX_RE = /\A[-+]?0[xX][0-9a-fA-F_]+\z/.freeze
|
|
@@ -350,18 +689,22 @@ module SmarterJSON
|
|
|
350
689
|
# followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
|
|
351
690
|
# would change the string — so when it doesn't match, we skip normalization.
|
|
352
691
|
NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
#
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
692
|
+
|
|
693
|
+
# parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
|
|
694
|
+
# MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
|
|
695
|
+
# portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
|
|
696
|
+
BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
|
|
697
|
+
DQUOTE_OR_BACKSLASH = /["\\]/.freeze
|
|
698
|
+
SQUOTE_OR_BACKSLASH = /['\\]/.freeze
|
|
699
|
+
|
|
700
|
+
# scan_quoteless_run's fast path jumps (in C) to the first structural terminator
|
|
701
|
+
# (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
|
|
702
|
+
# incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
|
|
703
|
+
# interior whitespace, so there's nothing to trim and no comment marker can apply.
|
|
704
|
+
QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
|
|
705
|
+
|
|
706
|
+
# The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
|
|
707
|
+
DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
|
|
365
708
|
|
|
366
709
|
def initialize(input, options = {})
|
|
367
710
|
raise ArgumentError, "input must be a String" unless input.is_a?(String)
|
|
@@ -369,8 +712,13 @@ module SmarterJSON
|
|
|
369
712
|
opts = DEFAULT_OPTIONS.merge(options)
|
|
370
713
|
@symbolize_keys = opts[:symbolize_keys]
|
|
371
714
|
@duplicate_key = opts[:duplicate_key]
|
|
372
|
-
@
|
|
373
|
-
@on_warning
|
|
715
|
+
@decimal_precision = opts[:decimal_precision]
|
|
716
|
+
@on_warning = opts[:on_warning]
|
|
717
|
+
# store_member only needs the (per-member) Hash#key? duplicate lookup when a
|
|
718
|
+
# repeat would change behavior: a warning must fire, or :first_wins must keep the
|
|
719
|
+
# first. With the default (:last_wins, no handler) a duplicate just overwrites,
|
|
720
|
+
# which `hash[k] = value` already does — so skip the lookup entirely.
|
|
721
|
+
@check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
|
|
374
722
|
|
|
375
723
|
encoding = opts[:encoding]
|
|
376
724
|
@input = encoding ? input.dup.force_encoding(encoding) : input
|
|
@@ -379,8 +727,6 @@ module SmarterJSON
|
|
|
379
727
|
@bytesize = @input.bytesize
|
|
380
728
|
# Skip a UTF-8 BOM (EF BB BF) at the start of input.
|
|
381
729
|
@pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
|
|
382
|
-
@line = 1
|
|
383
|
-
@col = 1
|
|
384
730
|
end
|
|
385
731
|
|
|
386
732
|
# No block: auto-detect the document count for free (the same "is there
|
|
@@ -390,17 +736,14 @@ module SmarterJSON
|
|
|
390
736
|
# value. Commas DO separate top-level documents (skip_document_separators consumes
|
|
391
737
|
# them between documents), so a bracketless list such as `1, 2, 3` is three documents.
|
|
392
738
|
def parse
  documents = []
  until eof?
    # Drop leading separators; if nothing but separators remained, we are done.
    skip_document_separators
    break if eof?

    document = parse_document
    enforce_scalar_boundary(document)
    documents.push(document)
  end
  documents
end
|
|
@@ -408,13 +751,17 @@ module SmarterJSON
|
|
|
408
751
|
# Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
|
|
409
752
|
# whitespace-separated). Used by the block form of SmarterJSON.process.
|
|
410
753
|
def each_value
  yielded = 0
  until eof?
    # Drop leading separators; if nothing but separators remained, we are done.
    skip_document_separators
    break if eof?

    document = parse_document
    enforce_scalar_boundary(document)
    yield document
    yielded += 1
  end
  # The block form reports how many documents were yielded.
  yielded
end
|
|
419
766
|
|
|
420
767
|
private
|
|
@@ -425,6 +772,48 @@ module SmarterJSON
|
|
|
425
772
|
parse_iter(implicit_root_object_ahead?)
|
|
426
773
|
end
|
|
427
774
|
|
|
775
|
+
# Between top-level documents, whitespace, comments, AND commas all separate
|
|
776
|
+
# (commas collapse like the in-container lenient-comma rule). A space alone never
|
|
777
|
+
# separates — a bare scalar followed by a space and more content raises (see
|
|
778
|
+
# enforce_scalar_boundary), so `1 2 3` is an error while `1, 2, 3` is three documents.
|
|
779
|
+
def skip_document_separators
  skip_whitespace_and_comments
  # Each comma is consumed together with whatever whitespace / comments follow
  # it, so runs such as ", ,," collapse into a single separator.
  until byte != COMMA
    advance(1)
    skip_whitespace_and_comments
  end
end
|
|
786
|
+
|
|
787
|
+
# After a top-level value: a self-delimiting value (object / array / quoted string)
|
|
788
|
+
# may be followed by anything (the next document self-delimits), but a bare scalar
|
|
789
|
+
# (number / keyword) must be followed by a real separator — a newline, ',', a
|
|
790
|
+
# comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
|
|
791
|
+
# rather than silently splitting; bare top-level words raise in parse_value itself.
|
|
792
|
+
def enforce_scalar_boundary(value)
  # Quoted strings, objects and arrays delimit themselves; nothing to check.
  case value
  when String, Hash, Array then return
  end

  skip_horizontal_whitespace
  following = byte

  # End of input, a newline, a comma or a `#` comment all end the scalar.
  case following
  when nil, LF, CR, COMMA, HASH then return
  end

  # So does a `//` or `/*` comment.
  if following == SLASH
    after_slash = byte_at(1)
    return if after_slash == SLASH || after_slash == STAR
  end

  raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
end
|
|
802
|
+
|
|
803
|
+
# Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
|
|
804
|
+
# document separators. Used by the scalar-boundary check above.
|
|
805
|
+
def skip_horizontal_whitespace
  while (current = byte)
    case current
    when SPACE, TAB, 0x0B, 0x0C
      # ASCII space, tab, vertical tab, form feed.
      advance(1)
    else
      # Any other ASCII byte (newlines included) ends the run.
      break if current < 0x80

      # Non-ASCII lead byte: skip it only when multibyte_ws_len reports a
      # whitespace sequence of positive length at this position.
      width = multibyte_ws_len(@pos)
      break unless width.positive?

      @pos += width
    end
  end
end
|
|
816
|
+
|
|
428
817
|
# Iterative container parser — explicit stack, NO Ruby recursion, so nesting
|
|
429
818
|
# is bounded only by memory (like Oj and the C extension's fj_parse_iter),
|
|
430
819
|
# never by the call stack. Mirrors the C driver to keep the two paths in
|
|
@@ -445,9 +834,10 @@ module SmarterJSON
|
|
|
445
834
|
end
|
|
446
835
|
|
|
447
836
|
vss = false # warnings: has a value landed in the current container since the last separator?
|
|
448
|
-
|
|
837
|
+
input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
|
|
838
|
+
while true
|
|
449
839
|
skip_whitespace_and_comments
|
|
450
|
-
b =
|
|
840
|
+
b = input.getbyte(@pos)
|
|
451
841
|
if at_top
|
|
452
842
|
if b == LBRACE
|
|
453
843
|
advance(1)
|
|
@@ -466,8 +856,17 @@ module SmarterJSON
|
|
|
466
856
|
at_top = false
|
|
467
857
|
vss = false
|
|
468
858
|
elsif b.nil?
|
|
859
|
+
# Defensive guard: parse / each_value check eof? before calling parse_iter,
|
|
860
|
+
# so `at_top` never meets end-of-input here. Kept to mirror the C driver.
|
|
861
|
+
# :nocov:
|
|
469
862
|
raise error("unexpected end of input")
|
|
863
|
+
# :nocov:
|
|
470
864
|
else
|
|
865
|
+
# Top-level scalar: must be a recognized JSON value (number / literal /
|
|
866
|
+
# quoted string). A bare word raises — there are no top-level quoteless
|
|
867
|
+
# strings (Decision 2 = B-broad). In-container quoteless still uses
|
|
868
|
+
# parse_member_value; the scalar-vs-separator boundary is enforced by the
|
|
869
|
+
# parse / each_value loop via enforce_scalar_boundary.
|
|
471
870
|
return parse_value
|
|
472
871
|
end
|
|
473
872
|
elsif b == COMMA
|
|
@@ -495,12 +894,12 @@ module SmarterJSON
|
|
|
495
894
|
else
|
|
496
895
|
key = parse_object_key
|
|
497
896
|
skip_whitespace_and_comments
|
|
498
|
-
raise error("expected ':' after key #{key.inspect}") unless
|
|
897
|
+
raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
|
|
499
898
|
|
|
500
899
|
advance(1)
|
|
501
900
|
skip_whitespace_and_comments
|
|
502
|
-
b =
|
|
503
|
-
if
|
|
901
|
+
b = input.getbyte(@pos)
|
|
902
|
+
if b == LBRACE || b == LBRACKET
|
|
504
903
|
child = b == LBRACE ? {} : []
|
|
505
904
|
advance(1) # consume { or [
|
|
506
905
|
store_member(cur, key, child)
|
|
@@ -508,7 +907,7 @@ module SmarterJSON
|
|
|
508
907
|
cur = child
|
|
509
908
|
cur_obj = (b == LBRACE)
|
|
510
909
|
vss = false
|
|
511
|
-
elsif
|
|
910
|
+
elsif b == RBRACE || b == COMMA
|
|
512
911
|
# key with a colon but no value -> null (don't consume } or ,; the loop does)
|
|
513
912
|
store_member(cur, key, nil)
|
|
514
913
|
warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
|
|
@@ -533,7 +932,7 @@ module SmarterJSON
|
|
|
533
932
|
raise error("unterminated array")
|
|
534
933
|
elsif b == RBRACE
|
|
535
934
|
raise error("unexpected '}' — expected ']' or a value")
|
|
536
|
-
elsif
|
|
935
|
+
elsif b == LBRACE || b == LBRACKET
|
|
537
936
|
child = b == LBRACE ? {} : []
|
|
538
937
|
advance(1) # consume { or [
|
|
539
938
|
cur.push(child)
|
|
@@ -555,11 +954,11 @@ module SmarterJSON
|
|
|
555
954
|
b = byte
|
|
556
955
|
return false unless b && key_start_byte?(b)
|
|
557
956
|
|
|
558
|
-
saved =
|
|
957
|
+
saved = @pos
|
|
559
958
|
advance(1) while (c = byte) && key_continue_byte?(c)
|
|
560
959
|
skip_pure_whitespace
|
|
561
960
|
result = (byte == COLON)
|
|
562
|
-
@pos
|
|
961
|
+
@pos = saved
|
|
563
962
|
result
|
|
564
963
|
end
|
|
565
964
|
|
|
@@ -577,46 +976,72 @@ module SmarterJSON
|
|
|
577
976
|
@pos >= @bytesize
|
|
578
977
|
end
|
|
579
978
|
|
|
979
|
+
# Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
|
|
980
|
+
# is computed lazily in line_col_at only when an error/warning is built. This is
|
|
981
|
+
# the hot-path primitive every consumed byte goes through, so it stays O(1) with
|
|
982
|
+
# no block, no re-read, and no per-byte branching. Mirrors the C fj_advance.
|
|
580
983
|
def advance(n = 1)
  # Move the cursor forward by n bytes; no line/column state is touched here.
  @pos += n
  return unless @pos > @bytesize

  # Never leave the cursor past the end of the buffer.
  @pos = @bytesize
end
|
|
584
987
|
|
|
988
|
+
# Line and 1-based BYTE column at byte position `pos`, computed lazily by scanning
|
|
989
|
+
# from the start of the buffer — only on the cold path (error / warning / triple-quote
|
|
990
|
+
# indent), never per byte. CR, LF, and CRLF each count as one newline; the column is
|
|
991
|
+
# the byte offset within the line. Mirrors the C extension's fj_line_col so both paths
|
|
992
|
+
# report identical positions.
|
|
993
|
+
def line_col_at(pos = @pos)
  # Scan [0, stop) from the start of the buffer, counting newlines and bytes.
  # Returns [line, column], both 1-based; the column is a byte offset in the line.
  stop = pos >= @bytesize ? @bytesize : pos
  line_no = 1
  column = 1
  offset = 0
  while offset < stop
    case @input.getbyte(offset)
    when LF
      line_no += 1
      column = 1
    when CR
      line_no += 1
      column = 1
      # CRLF counts as one newline: step over the LF as well.
      offset += 1 if offset + 1 < @bytesize && @input.getbyte(offset + 1) == LF
    else
      column += 1
    end
    offset += 1
  end
  [line_no, column]
end
|
|
1014
|
+
|
|
1015
|
+
# 1-based byte column at `pos` (bytes since the last line start). Used for
|
|
1016
|
+
# triple-quoted-string indentation stripping. Mirrors the C fj_column.
|
|
1017
|
+
def column_at(pos = @pos)
  # Walk backwards from the byte before `pos` until a line break (LF or CR) or the
  # start of the buffer; the distance walked, plus one, is the 1-based byte column.
  cursor = pos - 1
  while cursor >= 0
    prior = @input.getbyte(cursor)
    break if prior == LF || prior == CR

    cursor -= 1
  end
  pos - cursor
end
|
|
600
1026
|
|
|
601
1027
|
# --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
|
|
602
1028
|
|
|
603
1029
|
def skip_pure_whitespace
  # Work on locals and write @pos back once at the end.
  buffer = @input
  cursor = @pos
  while (current = buffer.getbyte(cursor))
    if current == SPACE || current.between?(TAB, CR) # 0x20, or 0x09..0x0D
      cursor += 1
    elsif current >= 0x80
      # Possible multi-byte whitespace char; a width of 0 means it is not whitespace.
      width = multibyte_ws_len(cursor)
      break if width.zero?

      cursor += width
    else
      break
    end
  end
  @pos = cursor
end
|
|
621
1046
|
|
|
622
1047
|
# Number of bytes of the Unicode-whitespace char starting at pos, or 0.
|
|
@@ -650,19 +1075,20 @@ module SmarterJSON
|
|
|
650
1075
|
# A '#', '//', or '/*' starts a comment only when preceded by whitespace
|
|
651
1076
|
# or at the very start of input (the comment-marker rule).
|
|
652
1077
|
def skip_whitespace_and_comments
  # Alternate between skipping whitespace and skipping one comment until the
  # cursor rests on something that is neither.
  while true
    skip_pure_whitespace
    case byte
    when HASH
      # '#' opens a comment only after whitespace or at the very start of input.
      break unless preceded_by_ws_or_start?

      skip_to_eol
    when SLASH
      marker = byte_at(1)
      # A lone '/' is not a comment; it needs a second '/' or a '*'.
      break unless marker == SLASH || marker == STAR
      break unless preceded_by_ws_or_start?

      if marker == STAR
        skip_block_comment
      else
        skip_to_eol
      end
    else
      break
    end
  end
end
|
|
@@ -702,8 +1128,9 @@ module SmarterJSON
|
|
|
702
1128
|
# --- values ---
|
|
703
1129
|
|
|
704
1130
|
# Top-level / strict value: no quoteless fallback.
|
|
1131
|
+
# Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
|
|
1132
|
+
# so @pos is at the value's first byte — no leading skip needed here.
|
|
705
1133
|
def parse_value
|
|
706
|
-
skip_whitespace_and_comments
|
|
707
1134
|
raise error("unexpected end of input") if eof?
|
|
708
1135
|
|
|
709
1136
|
b = byte
|
|
@@ -736,8 +1163,9 @@ module SmarterJSON
|
|
|
736
1163
|
end
|
|
737
1164
|
|
|
738
1165
|
# Value in object-value or array-element position: quoteless allowed.
|
|
1166
|
+
# Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
|
|
1167
|
+
# so @pos is at the value's first byte — no leading skip needed here.
|
|
739
1168
|
def parse_member_value
|
|
740
|
-
skip_whitespace_and_comments
|
|
741
1169
|
raise error("unexpected end of input") if eof?
|
|
742
1170
|
|
|
743
1171
|
b = byte
|
|
@@ -770,7 +1198,7 @@ module SmarterJSON
|
|
|
770
1198
|
until eof?
|
|
771
1199
|
if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
|
|
772
1200
|
closers.include?(@input.getbyte(@pos + 2))
|
|
773
|
-
result = @input.byteslice(start, @pos - start)
|
|
1201
|
+
result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
774
1202
|
advance(3)
|
|
775
1203
|
return result
|
|
776
1204
|
end
|
|
@@ -781,9 +1209,7 @@ module SmarterJSON
|
|
|
781
1209
|
|
|
782
1210
|
def store_member(hash, key, value)
|
|
783
1211
|
k = @symbolize_keys ? key.to_sym : key
|
|
784
|
-
if hash.key?(k)
|
|
785
|
-
raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
|
|
786
|
-
|
|
1212
|
+
if @check_duplicates && hash.key?(k)
|
|
787
1213
|
warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
|
|
788
1214
|
return if @duplicate_key == :first_wins
|
|
789
1215
|
end
|
|
@@ -814,51 +1240,77 @@ module SmarterJSON
|
|
|
814
1240
|
start = @pos
|
|
815
1241
|
advance(1)
|
|
816
1242
|
advance(1) while (b = byte) && key_continue_byte?(b)
|
|
817
|
-
@input.byteslice(start, @pos - start)
|
|
1243
|
+
@input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
818
1244
|
end
|
|
819
1245
|
|
|
820
1246
|
# --- quoteless strings & literal classification ---
|
|
821
1247
|
|
|
822
1248
|
def parse_quoteless_or_literal
  origin = @pos
  # scan_quoteless_run moves @pos past the run and returns the offset just past
  # the last non-whitespace byte, so slicing to it drops trailing whitespace.
  trimmed_end = scan_quoteless_run

  # An empty run means the first byte was a delimiter and @pos did not move;
  # raise here so the caller's loop cannot spin on the same position forever.
  if @pos == origin
    raise error("expected a value")
  end

  classify_quoteless(@input.byteslice(origin, trimmed_end - origin))
end
|
|
834
1264
|
|
|
835
1265
|
# Advance to the end of a quoteless run. Stops at structural punctuation
|
|
836
|
-
# (',' '}' ']'
|
|
837
|
-
#
|
|
1266
|
+
# (',' '{' '}' '[' ']' — openers terminate symmetrically with closers, so a
|
|
1267
|
+
# self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
|
|
1268
|
+
# a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
|
|
1269
|
+
# themselves are not delimiters.
|
|
1270
|
+
# Advance @pos to the end of the quoteless run (including any trailing whitespace,
|
|
1271
|
+
# so the parser resumes correctly after the value). Returns value_end: the byte
|
|
1272
|
+
# offset just past the last NON-whitespace char, so the caller can slice off
|
|
1273
|
+
# trailing whitespace without a regex.
|
|
838
1274
|
def scan_quoteless_run
  buffer = @input
  cursor = @pos

  # Fast path: a single byteindex call finds the first QL_BREAK match. When that
  # lands on a structural terminator, a line break, or EOF, the whole span is the
  # value and nothing needs trimming.
  if BYTEINDEX_AVAILABLE
    stop = buffer.byteindex(QL_BREAK, cursor) || @bytesize
    at_stop = stop < @bytesize ? buffer.getbyte(stop) : nil
    case at_stop
    when nil, COMMA, RBRACE, RBRACKET, LBRACE, LBRACKET, LF, CR
      @pos = stop
      return stop
    end
  end

  # Slow path: walk byte by byte, remembering where the last non-whitespace byte
  # ended (trimmed_end) and whether the previous char was whitespace (after_ws).
  trimmed_end = cursor
  after_ws = false
  while (current = buffer.getbyte(cursor))
    case current
    when COMMA, RBRACE, RBRACKET, LBRACE, LBRACKET, LF, CR
      break
    when HASH
      # '#' ends the run only when it follows whitespace.
      break if after_ws
    when SLASH
      # '//' or '/*' ends the run only when it follows whitespace.
      if after_ws
        peek = buffer.getbyte(cursor + 1)
        break if peek == SLASH || peek == STAR
      end
    end

    # Width of the whitespace char at the cursor; 0 when it is not whitespace.
    width =
      if current == SPACE || current.between?(TAB, CR) # tab/VT/FF/space (LF/CR already broke)
        1
      elsif current >= 0x80
        multibyte_ws_len(cursor)
      else
        0
      end

    if width.positive?
      after_ws = true
      cursor += width
    else
      after_ws = false
      cursor += 1
      trimmed_end = cursor
    end
  end

  @pos = cursor
  trimmed_end
end
|
|
863
1315
|
|
|
864
1316
|
def classify_quoteless(str)
|
|
@@ -869,7 +1321,7 @@ module SmarterJSON
|
|
|
869
1321
|
when "undefined" then return nil
|
|
870
1322
|
when "NaN" then return Float::NAN
|
|
871
1323
|
when "Infinity", "+Infinity" then return Float::INFINITY
|
|
872
|
-
when "-Infinity" then return
|
|
1324
|
+
when "-Infinity" then return -Float::INFINITY
|
|
873
1325
|
end
|
|
874
1326
|
num = numeric_value(str)
|
|
875
1327
|
num.equal?(NOT_NUMERIC) ? str : num
|
|
@@ -877,31 +1329,73 @@ module SmarterJSON
|
|
|
877
1329
|
|
|
878
1330
|
# Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
|
|
879
1331
|
def numeric_value(str)
  # Hex: the cheap byte-level prefix check runs first so HEX_RE is only consulted
  # for tokens that can possibly match it.
  if hex_prefix?(str) && HEX_RE.match?(str)
    digits = str.sub(/\A[-+]/, "").delete("_")[2..-1] # drop sign, underscores and "0x"
    magnitude = digits.to_i(16)
    return str.getbyte(0) == MINUS ? -magnitude : magnitude
  end

  # Decimal: must match DEC_RE and contain at least one digit.
  return NOT_NUMERIC unless DEC_RE.match?(str)
  return NOT_NUMERIC unless str.match?(/[0-9]/)

  # Only allocate a cleaned copy when there is an underscore to remove.
  cleaned = str.include?("_") ? str.delete("_") : str
  return decimal_value(cleaned) if cleaned.match?(/[.eE]/)

  cleaned.to_i
end
|
|
891
1349
|
|
|
892
|
-
#
|
|
1350
|
+
# True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match.
|
|
1351
|
+
def hex_prefix?(str)
  # Find where the '0' of "0x" must sit: index 0, or index 1 after a sign.
  zero_index =
    case str.getbyte(0)
    when ZERO then 0
    when MINUS, PLUS then 1
    else return false
    end
  return false unless str.getbyte(zero_index) == ZERO

  # The byte after the '0' must be 'x' or 'X'.
  marker = str.getbyte(zero_index + 1)
  marker == LOWER_X || marker == UPPER_X
end
|
|
1362
|
+
|
|
1363
|
+
# A decimal (has '.' or exponent). decimal_precision: :float -> Float,
|
|
893
1364
|
# :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
|
|
894
1365
|
# than 16 significant digits (Oj's DEC_MAX threshold), else Float.
|
|
895
1366
|
def decimal_value(body)
  mode = @decimal_precision
  return body.to_f if mode == :float
  return to_big_decimal(body) if mode == :bigdecimal

  # Any other setting: BigDecimal only when the mantissa has more than 16
  # significant digits, otherwise a Float.
  if significant_digits(body) > 16
    to_big_decimal(body)
  else
    body.to_f
  end
end
|
|
902
1373
|
|
|
1374
|
+
# Count significant mantissa digits (leading zeros excluded, exponent ignored) to pick
|
|
1375
|
+
# Float vs BigDecimal in :auto mode. A single byte-scan — the old three-regex version
|
|
1376
|
+
# (strip exponent, strip non-digits, strip leading zeros, .length) ran on every float
|
|
1377
|
+
# and dominated the number path's cost. body is a DEC_RE-validated token (digits, at most
|
|
1378
|
+
# one '.', optional sign, optional e/E exponent), underscores already removed.
|
|
903
1379
|
def significant_digits(body)
  total = 0
  seen_nonzero = false
  body.each_byte do |ch|
    # Everything from the exponent marker onward is not part of the mantissa.
    break if ch == LOWER_E || ch == UPPER_E
    # Sign and decimal point are not digits.
    next unless ch.between?(ZERO, NINE)
    # Zeros before the first non-zero digit (on either side of '.') don't count.
    next if ch == ZERO && !seen_nonzero

    seen_nonzero = true
    total += 1
  end
  total
end
|
|
906
1400
|
|
|
907
1401
|
def to_big_decimal(body)
|
|
@@ -912,7 +1406,11 @@ module SmarterJSON
|
|
|
912
1406
|
body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
|
|
913
1407
|
BigDecimal(body)
|
|
914
1408
|
rescue ArgumentError
|
|
1409
|
+
# Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
|
|
1410
|
+
# so this fallback is unreachable from valid input. Kept as a safety net.
|
|
1411
|
+
# :nocov:
|
|
915
1412
|
body.to_f
|
|
1413
|
+
# :nocov:
|
|
916
1414
|
end
|
|
917
1415
|
|
|
918
1416
|
# BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
|
|
@@ -931,7 +1429,7 @@ module SmarterJSON
|
|
|
931
1429
|
end
|
|
932
1430
|
|
|
933
1431
|
def parse_triple_quoted
|
|
934
|
-
indent = @
|
|
1432
|
+
indent = column_at(@pos) - 1
|
|
935
1433
|
advance(3)
|
|
936
1434
|
raw_start = @pos
|
|
937
1435
|
until eof?
|
|
@@ -941,7 +1439,7 @@ module SmarterJSON
|
|
|
941
1439
|
end
|
|
942
1440
|
raise error("unterminated triple-quoted string") if eof?
|
|
943
1441
|
|
|
944
|
-
raw = @input.byteslice(raw_start, @pos - raw_start)
|
|
1442
|
+
raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
|
|
945
1443
|
advance(3)
|
|
946
1444
|
strip_triple(raw, indent)
|
|
947
1445
|
end
|
|
@@ -971,20 +1469,30 @@ module SmarterJSON
|
|
|
971
1469
|
def parse_string(quote)
|
|
972
1470
|
advance(1)
|
|
973
1471
|
start = @pos
|
|
974
|
-
|
|
1472
|
+
# Fast path (the common case — a string with no escapes): jump straight to the
|
|
1473
|
+
# closing quote with byteindex. It is called only here, from `start`, which is
|
|
1474
|
+
# always a character boundary, so byteindex never sees a mid-char offset.
|
|
1475
|
+
hit = scan_string_delimiter(quote)
|
|
1476
|
+
raise error("unterminated string") if hit.nil?
|
|
1477
|
+
|
|
1478
|
+
if @input.getbyte(hit) == quote
|
|
1479
|
+
@pos = hit
|
|
1480
|
+
result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
1481
|
+
advance(1)
|
|
1482
|
+
return result
|
|
1483
|
+
end
|
|
1484
|
+
|
|
1485
|
+
# Escape path: a backslash precedes the closing quote. Scan byte by byte from
|
|
1486
|
+
# here — byteindex can't be used past a backslash (a lenient \<multibyte> would
|
|
1487
|
+
# leave @pos mid-character), and this lets the decoder flag invalid escapes
|
|
1488
|
+
# exactly as before. decode_string_with_escapes handles the whole [start, finish].
|
|
1489
|
+
@pos = hit
|
|
975
1490
|
while (b = byte)
|
|
976
1491
|
if b == quote
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
return decoded
|
|
981
|
-
else
|
|
982
|
-
result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
|
|
983
|
-
advance(1)
|
|
984
|
-
return result
|
|
985
|
-
end
|
|
1492
|
+
decoded = decode_string_with_escapes(start, @pos, quote)
|
|
1493
|
+
advance(1)
|
|
1494
|
+
return decoded
|
|
986
1495
|
elsif b == BACKSLASH
|
|
987
|
-
has_escape = true
|
|
988
1496
|
advance(1)
|
|
989
1497
|
raise error("unterminated string escape") if eof?
|
|
990
1498
|
|
|
@@ -996,6 +1504,20 @@ module SmarterJSON
|
|
|
996
1504
|
raise error("unterminated string")
|
|
997
1505
|
end
|
|
998
1506
|
|
|
1507
|
+
# Byte index of the next closing quote or backslash at/after @pos, or nil if
|
|
1508
|
+
# neither occurs before EOF. byteindex scans inside MRI's C; the fallback is a
|
|
1509
|
+
# tight getbyte loop (the ASCII delimiters never alias UTF-8 continuation bytes,
|
|
1510
|
+
# so byte scanning is correct for UTF-8 string content).
|
|
1511
|
+
def scan_string_delimiter(quote)
  # Preferred: let byteindex search for the quote-or-backslash pattern.
  if BYTEINDEX_AVAILABLE
    pattern = quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH
    return @input.byteindex(pattern, @pos)
  end

  # Fallback: plain byte scan from @pos for the closing quote or a backslash.
  cursor = @pos
  while cursor < @bytesize
    current = @input.getbyte(cursor)
    return cursor if current == quote || current == BACKSLASH

    cursor += 1
  end
  nil
end
|
|
1520
|
+
|
|
999
1521
|
def decode_string_with_escapes(start, finish, _quote)
|
|
1000
1522
|
buf = String.new(encoding: Encoding::ASCII_8BIT)
|
|
1001
1523
|
i = start
|
|
@@ -1087,7 +1609,7 @@ module SmarterJSON
|
|
|
1087
1609
|
|
|
1088
1610
|
if byte == ZERO
|
|
1089
1611
|
advance(1)
|
|
1090
|
-
if
|
|
1612
|
+
if (x = byte) == LOWER_X || x == UPPER_X
|
|
1091
1613
|
advance(1)
|
|
1092
1614
|
hex_start = @pos
|
|
1093
1615
|
advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
|
|
@@ -1112,10 +1634,10 @@ module SmarterJSON
|
|
|
1112
1634
|
advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
|
|
1113
1635
|
end
|
|
1114
1636
|
|
|
1115
|
-
if
|
|
1637
|
+
if (e = byte) == LOWER_E || e == UPPER_E
|
|
1116
1638
|
is_float = true
|
|
1117
1639
|
advance(1)
|
|
1118
|
-
advance(1) if
|
|
1640
|
+
advance(1) if (s = byte) == PLUS || s == MINUS
|
|
1119
1641
|
raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
|
|
1120
1642
|
|
|
1121
1643
|
advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
|
|
@@ -1151,11 +1673,13 @@ module SmarterJSON
|
|
|
1151
1673
|
def warn(type, message)
  handler = @on_warning
  return unless handler

  # Line and column are computed only now that a warning is actually being emitted.
  position = line_col_at(@pos)
  handler.call(Warning.new(type, message, *position))
end
|
|
1156
1679
|
|
|
1157
1680
|
def error(message)
  # Builds (does not raise) a ParseError carrying the line and column of @pos.
  ParseError.new(message, *line_col_at(@pos))
end
|
|
1160
1684
|
|
|
1161
1685
|
def display_byte(b)
|