smarter_json 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,957 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterJSON
4
+ # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
5
+ # from the shared SmarterJSON::Error base.
6
+
7
+ module_function
8
+
9
+ # SmarterJSON.process(input, options = {}) — the main entry point.
10
+ #
11
+ # `input` is either a String of JSON content or an IO to read from. (A String
12
+ # is always content, never a filename — use process_file for paths.) The values
13
+ # in `options` override Parser::DEFAULT_OPTIONS.
14
+ #
15
+ # Without a block: returns nil (zero documents), the value (one document), or an
16
+ # Array of the values (two or more — NDJSON / JSONL / concatenated / whitespace-
17
+ # separated). :acceleration (default true) selects the C extension when compiled
18
+ # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
+ #
20
+ # With a block: yields each top-level document as it is parsed, and returns nil.
21
+ # For an IO this streams document-by-document in bounded memory — it reads the
22
+ # stream as newline-delimited documents (NDJSON / JSONL), one per line.
23
# Main entry point. A String is parsed as JSON content; any object responding
# to #read is treated as an IO. With a block, an IO is streamed line by line
# via stream_io; without one it is read fully and parsed in memory.
# Raises ArgumentError for any other kind of input.
def process(input, options = {}, &block)
  return process_content(input, options, &block) if input.is_a?(String)

  unless input.respond_to?(:read)
    raise ArgumentError, "SmarterJSON.process expects a String or an IO, got #{input.class}"
  end

  return stream_io(input, options, &block) if block

  process_content(input.read, options)
end
32
+
33
+ # SmarterJSON.process_file(path, options = {}) — open a file and process it.
34
+ #
35
+ # The :encoding option labels the file's encoding (default "UTF-8"); it does NOT
36
+ # trigger a transcoding pass — the parser works on the bytes in their native
37
+ # encoding and emits string values with the same encoding tag. With a block,
38
+ # streams document-by-document straight from disk in bounded memory (never
39
+ # loading the whole file); the documents are read as newline-delimited
40
+ # (NDJSON / JSONL), one per line.
41
# Open the file at `path` and process it. options[:encoding] labels the file's
# encoding and defaults to "UTF-8". With a block the file is streamed through
# stream_io; without one it is read fully and handed to process_content.
def process_file(path, options = {}, &block)
  # Use `||` rather than fetch-with-default: an explicit `encoding: nil` (the
  # value carried by Parser::DEFAULT_OPTIONS) must also fall back to UTF-8
  # instead of producing the malformed open mode "r:".
  encoding = options[:encoding] || "UTF-8"
  if block
    File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
  else
    process_content(File.read(path, encoding: encoding), options)
  end
end
49
+
50
+ # Parse a String of JSON content (the in-memory path). Returns nil (block) or
51
+ # the value / Array (no block); the C extension is used when available.
52
# Parse an in-memory String. Uses parse_c when options[:acceleration] (default
# true) is set and HAS_ACCELERATION is truthy; otherwise the pure-Ruby Parser.
# With a block each document is yielded; without one the parsed result is
# returned, paired with the warnings when options[:warnings] is set.
def process_content(input, options, &block)
  accelerated = options.fetch(:acceleration, true) && HAS_ACCELERATION
  # parse_c returns [result, warnings] itself when options[:warnings] is set.
  return parse_c(input, options, &block) if accelerated

  parser = Parser.new(input, options)
  return parser.each_value(&block) if block
  return parser.parse unless options.fetch(:warnings, false)

  [parser.parse, parser.warnings]
end
66
+
67
+ # Stream documents from an IO, one line (= one document) at a time, yielding
68
+ # each — bounded memory. Newline-delimited (NDJSON / JSONL); a single document
69
+ # spanning multiple lines is not supported by the streaming path.
70
# Read `io` one "\n"-terminated line at a time and parse each line as its own
# content, forwarding the caller's block. Always returns nil.
def stream_io(io, options, &block)
  io.each_line("\n") do |line|
    process_content(line, options, &block)
  end
  nil
end
74
+
75
+ private_class_method :process_content, :stream_io
76
+
77
+ # Hand-rolled FSM single-pass parser.
78
+ # Layer 1: strict JSON (RFC 8259).
79
+ # Layer 2: JSON5 additions — line/block comments, trailing comma,
80
+ # unquoted ECMAScript identifier keys, single-quoted strings,
81
+ # hex numbers, leading/trailing decimal points, Infinity/NaN,
82
+ # explicit + sign, \-line-continuation inside strings.
83
+ # Layer 3: HJSON-inspired additions — #/comment-marker rule, triple-quoted
84
+ # strings, quoteless single-line strings, implicit root object,
85
+ # newline-as-separator, broader unquoted keys, recognized-literals-win.
86
+ # Layer 4: smarter_json additions — UTF-8 BOM skip, smart/curly quotes,
87
+ # Python literals (True/False/None) and undefined, underscores in
88
+ # numeric literals, and encoding validation (SmarterJSON::EncodingError).
89
+ class Parser
90
# Byte values of the ASCII characters the parser compares against.

# Structural punctuation
LBRACE = "{".ord
RBRACE = "}".ord
LBRACKET = "[".ord
RBRACKET = "]".ord
COLON = ":".ord
COMMA = ",".ord

# Quotes, escapes and comment markers
DQUOTE = '"'.ord
SQUOTE = "'".ord
BACKSLASH = "\\".ord
SLASH = "/".ord
STAR = "*".ord
HASH = "#".ord

# Number syntax
MINUS = "-".ord
PLUS = "+".ord
DOT = ".".ord
ZERO = "0".ord
NINE = "9".ord
LOWER_E = "e".ord
UPPER_E = "E".ord

# First letters of literals / escapes / hex prefixes
LOWER_T = "t".ord
LOWER_F = "f".ord
LOWER_N = "n".ord
LOWER_U = "u".ord
LOWER_X = "x".ord
UPPER_X = "X".ord
UPPER_I = "I".ord
UPPER_N = "N".ord
UPPER_T = "T".ord
UPPER_F = "F".ord

# Identifier punctuation
UNDERSCORE = "_".ord
DOLLAR = "$".ord

# ASCII whitespace
SPACE = " ".ord
TAB = "\t".ord
LF = "\n".ord
CR = "\r".ord
125
+
126
# Sentinel returned by numeric_value for a token that is not a number
# (checked with equal?, so it can never collide with a parsed value).
NOT_NUMERIC = Object.new

# Whole-token hex integer: optional sign, 0x / 0X, hex digits and underscores.
HEX_RE = /\A[-+]?0[xX][0-9a-fA-F_]+\z/.freeze

# Whole-token decimal. The mantissa needs an integer part or a leading-dot
# fraction, so a bare exponent such as "-e695881" does not match and is left
# to become a quoteless string. The exponent itself is optional.
DEC_RE = /\A[-+]?(?:(?:0|[1-9][0-9_]*)(?:\.[0-9_]*)?|\.[0-9_]+)(?:[eE][-+]?[0-9_]+)?\z/.freeze

# Matches exactly the decimals normalize_for_bigdecimal would rewrite: a
# leading dot (".5") or a dot with no digit after it ("5.", "5.e3").
NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze

# Leading / trailing runs of [[:space:]], used by trim_blank.
BLANK_HEAD = /\A[[:space:]]+/.freeze
BLANK_TAIL = /[[:space:]]+\z/.freeze

# Defaults for every caller-facing option; caller values are merged over these.
DEFAULT_OPTIONS = {
  # use the C extension when available
  acceleration: true,
  # label the input's encoding (no transcoding)
  encoding: nil,
  # Symbol keys instead of String
  symbolize_keys: false,
  # :last_wins | :first_wins | :raise
  duplicate_key: :last_wins,
  # :auto | :float | :bigdecimal (Oj-compatible)
  bigdecimal_load: :auto,
  # collect non-fatal lenient fixes; process returns [result, warnings]
  warnings: false,
}.freeze
148
+
149
+ # Warnings collected during the parse (empty slots, empty values, dropped duplicate
150
+ # keys). Empty unless the parser was built with warnings: true. Public so the module
151
+ # functions can read it after parse / each_value.
152
+ attr_reader :warnings
153
+
154
# Build a parser over `input` (must be a String). `options` are merged over
# DEFAULT_OPTIONS. When :encoding is given, a copy of the input is relabelled
# with it (no transcoding). Raises ArgumentError for non-String input and
# EncodingError when the bytes are invalid for the input's encoding.
def initialize(input, options = {})
  raise ArgumentError, "input must be a String" unless input.is_a?(String)

  settings = DEFAULT_OPTIONS.merge(options)
  @symbolize_keys, @duplicate_key, @bigdecimal_load, @collect_warnings =
    settings.values_at(:symbolize_keys, :duplicate_key, :bigdecimal_load, :warnings)
  @warnings = []

  label = settings[:encoding]
  @input = label ? input.dup.force_encoding(label) : input
  unless @input.valid_encoding?
    raise EncodingError, "invalid byte sequence for #{@input.encoding.name}"
  end

  @bytesize = @input.bytesize
  # Start past a UTF-8 BOM (EF BB BF) when the input begins with one.
  has_bom = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF
  @pos = has_bom ? 3 : 0
  @line = 1
  @col = 1
end
174
+
175
+ # No block: auto-detect the document count for free (the same "is there
176
+ # trailing content?" check that used to raise). 0 documents -> nil; 1 document
177
+ # -> the value itself (single-document path, no Array allocated); 2+ documents
178
+ # (NDJSON / JSONL / concatenated / whitespace-separated) -> an Array of every
179
+ # value. Commas do NOT separate documents (only whitespace / newline /
180
+ # concatenation do), so a bracketless comma list still raises in parse_document.
181
# Parse the whole input and return nil (no documents), the single value (one
# document), or an Array of all values (two or more documents).
def parse
  skip_whitespace_and_comments
  return nil if eof?

  first = parse_document
  skip_whitespace_and_comments
  return first if eof?

  # More input follows the first document: collect every remaining one.
  rest = []
  until eof?
    rest << parse_document
    skip_whitespace_and_comments
  end
  [first, *rest]
end
196
+
197
+ # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
198
+ # whitespace-separated). Used by the block form of SmarterJSON.process.
199
# Yield every top-level document in turn until the input is exhausted.
# Always returns nil.
def each_value
  loop do
    skip_whitespace_and_comments
    if eof?
      break
    else
      yield parse_document
    end
  end
  nil
end
208
+
209
+ private
210
+
211
+ # --- top-level dispatch ---
212
+
213
# Parse one top-level document, first deciding whether it is a brace-less
# (implicit) root object.
def parse_document
  implicit_root = implicit_root_object_ahead?
  parse_iter(implicit_root)
end
216
+
217
+ # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
218
+ # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
219
+ # never by the call stack. Mirrors the C driver to keep the two paths in
220
+ # parity.
221
# Parse one document with an explicit container stack instead of recursion.
#
# implicit_root - truthy when the document is a brace-less root object; the
#                 root Hash is then opened up front and EOF closes it.
#
# Returns the root container, or the scalar from parse_value when the document
# does not start with '{' or '['. Raises the ParseError built by #error on
# malformed input.
def parse_iter(implicit_root)
  root = implicit_root ? {} : nil
  open = implicit_root ? [root] : [] # open containers, innermost last
  container = root                   # innermost open container
  in_object = implicit_root ? true : false
  top_level = !implicit_root
  # warnings: has a value landed in the current container since the last separator?
  value_seen = false

  loop do
    skip_whitespace_and_comments
    current = byte

    if top_level
      case current
      when LBRACE, LBRACKET
        advance(1)
        root = current == LBRACE ? {} : []
        open.push(root)
        container = root
        in_object = (current == LBRACE)
        top_level = false
        value_seen = false
      when nil
        raise error("unexpected end of input")
      else
        return parse_value
      end
      next
    end

    if current == COMMA
      # Commas collapse: an empty slot (leading, interior or trailing comma)
      # contributes nothing, so just step over it.
      warn(:empty_slot, "extra comma — collapsed an empty slot") unless value_seen
      value_seen = false
      advance(1)
      next
    end

    if in_object
      case current
      when RBRACE
        advance(1)
        open.pop
        return root if open.empty?

        container = open.last
        in_object = container.is_a?(Hash)
        value_seen = true # the just-closed container is a value in its parent
      when nil
        return root if implicit_root && open.size == 1

        raise error("unterminated object")
      when RBRACKET
        raise error("unexpected ']' — expected a key or '}'")
      else
        key = parse_object_key
        skip_whitespace_and_comments
        raise error("expected ':' after key #{key.inspect}") unless byte == COLON

        advance(1)
        skip_whitespace_and_comments
        head = byte
        case head
        when LBRACE, LBRACKET
          child = head == LBRACE ? {} : []
          advance(1) # consume { or [
          store_member(container, key, child)
          open.push(child)
          container = child
          in_object = (head == LBRACE)
          value_seen = false
        when RBRACE, COMMA
          # key with a colon but no value -> null; the '}' or ',' is left for
          # the next loop iteration to consume
          store_member(container, key, nil)
          warn(:empty_value, "key #{key.inspect} had no value — used null")
          value_seen = true
        when nil
          raise error("unexpected end of input")
        else
          store_member(container, key, parse_member_value)
          value_seen = true
        end
      end
    else
      case current
      when RBRACKET
        advance(1)
        open.pop
        return root if open.empty?

        container = open.last
        in_object = container.is_a?(Hash)
        value_seen = true # the just-closed container is a value in its parent
      when nil
        raise error("unterminated array")
      when RBRACE
        raise error("unexpected '}' — expected ']' or a value")
      when LBRACE, LBRACKET
        child = current == LBRACE ? {} : []
        advance(1) # consume { or [
        container.push(child)
        open.push(child)
        container = child
        in_object = (current == LBRACE)
        value_seen = false
      else
        container.push(parse_member_value)
        value_seen = true
      end
    end
  end
end
340
+
341
+ # At the start of a document: an unquoted identifier followed by ':' means
342
+ # an implicit root object (no outer braces). Look ahead without consuming.
343
# Look ahead, without consuming input, for an unquoted key followed by ':'
# (which marks a brace-less root object). Position, line and column are
# restored before returning.
def implicit_root_object_ahead?
  first = byte
  return false unless first && key_start_byte?(first)

  pos = @pos
  line = @line
  col = @col
  while (c = byte) && key_continue_byte?(c)
    advance(1)
  end
  skip_pure_whitespace
  colon_follows = (byte == COLON)
  @pos = pos
  @line = line
  @col = col
  colon_follows
end
354
+
355
+ # --- byte access ---
356
+
357
# The byte at the current position, or nil once the position is past the end.
def byte
  @input.getbyte(@pos)
end
360
+
361
# The byte `offset` positions ahead of the current one, or nil past the end.
def byte_at(offset)
  index = @pos + offset
  @input.getbyte(index)
end
364
+
365
# True once the position has reached (or passed) the end of the input.
def eof?
  !(@pos < @bytesize)
end
368
+
369
# Move forward up to `n` steps, keeping @line / @col in sync. LF and CR each
# start a new line; a CR LF pair is consumed as a single step. Stops early
# (returning nil) at end of input.
def advance(n = 1)
  n.times do
    case @input.getbyte(@pos)
    when nil
      return
    when LF
      @line += 1
      @col = 1
      @pos += 1
    when CR
      @line += 1
      @col = 1
      # swallow the LF of a CR LF pair together with the CR
      @pos += (@input.getbyte(@pos + 1) == LF ? 2 : 1)
    else
      @col += 1
      @pos += 1
    end
  end
end
389
+
390
+ # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
391
+
392
# Skip whitespace only (no comments): ASCII space and 0x09..0x0D, plus the
# multibyte whitespace characters recognised by multibyte_ws_len.
def skip_pure_whitespace
  while (b = byte)
    if b == SPACE || b.between?(TAB, CR)
      advance(1)
    elsif b >= 0x80 && (width = multibyte_ws_len(@pos)).positive?
      # one multibyte whitespace character counts as one column
      @pos += width
      @col += 1
    else
      break
    end
  end
end
410
+
411
+ # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
412
+ # Only meaningful for bytes >= 0x80.
413
# Byte length (2 or 3) of the multibyte whitespace character that starts at
# `pos`, or 0 when the bytes there are not one of the recognised characters.
# Intended for positions whose byte is >= 0x80.
def multibyte_ws_len(pos)
  lead = @input.getbyte(pos)
  # only C2 and E1..E3 can start one of the recognised characters
  return 0 unless lead == 0xC2 || (lead >= 0xE1 && lead <= 0xE3)

  second = @input.getbyte(pos + 1)
  return 0 if second.nil?

  if lead == 0xC2
    return (second == 0xA0 || second == 0x85 ? 2 : 0) # NBSP, NEL
  end

  third = @input.getbyte(pos + 2)
  return 0 if third.nil?

  matched =
    case lead
    when 0xE1
      second == 0x9A && third == 0x80 # U+1680
    when 0xE2
      if second == 0x80
        third.between?(0x80, 0x8A) || third == 0xA8 || third == 0xA9 || third == 0xAF
      else
        second == 0x81 && third == 0x9F # U+205F
      end
    when 0xE3
      second == 0x80 && third == 0x80 # U+3000
    end
  matched ? 3 : 0
end
438
+
439
+ # A '#', '//', or '/*' starts a comment only when preceded by whitespace
440
+ # or at the very start of input (the comment-marker rule).
441
# Skip whitespace and comments. '#', '//' and '/*' open a comment only when
# preceded_by_ws_or_start? holds; otherwise scanning stops in front of them.
def skip_whitespace_and_comments
  loop do
    skip_pure_whitespace
    lead = byte
    return if lead.nil?

    block_comment = lead == SLASH && byte_at(1) == STAR
    line_comment = lead == HASH || (lead == SLASH && byte_at(1) == SLASH)
    return unless block_comment || line_comment
    return unless preceded_by_ws_or_start?

    if block_comment
      skip_block_comment
    else
      skip_to_eol
    end
  end
end
458
+
459
# True when the current position is at the start of the input or directly
# follows a whitespace character (ASCII, or one recognised by
# multibyte_ws_len). Used to decide whether a comment marker really opens a
# comment.
def preceded_by_ws_or_start?
  return true if @pos.zero?
  # Parsing starts at offset 3 when the input opens with a UTF-8 BOM, so the
  # byte right after the BOM is the logical start of input as well.
  return true if @pos == 3 && @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF

  prev = @input.getbyte(@pos - 1)
  return true if prev == SPACE || (prev >= TAB && prev <= CR)
  return false if prev < 0x80

  # rare: a multibyte whitespace char ending right before @pos — walk back
  # over continuation bytes (10xxxxxx) to its lead byte and measure it
  i = @pos - 1
  i -= 1 while i.positive? && (@input.getbyte(i) & 0xC0) == 0x80
  n = multibyte_ws_len(i)
  n.positive? && (i + n == @pos)
end
472
+
473
# Advance to the next LF or CR (left unconsumed) or to end of input.
def skip_to_eol
  until (c = byte).nil? || c == LF || c == CR
    advance(1)
  end
end
476
+
477
# Skip a /* ... */ comment starting at the current position. Raises the
# ParseError built by #error when the closing */ is missing.
def skip_block_comment
  advance(2) # consume /*
  advance(1) until eof? || (byte == STAR && byte_at(1) == SLASH)
  raise error("unterminated block comment") if eof?

  advance(2) # consume */
end
488
+
489
+ # Layer 1 (strict JSON) shape: whitespace + at most one comma + whitespace.
490
+ # The Lenient Commas Option becomes a one-line change here.
491
+ # --- values ---
492
+
493
+ # Top-level / strict value: no quoteless fallback.
494
# Parse a value at a strict position (no quoteless-string fallback),
# dispatching on its first byte. Raises the ParseError built by #error at end
# of input or on a byte that cannot start a value.
def parse_value
  skip_whitespace_and_comments
  raise error("unexpected end of input") if eof?

  case (lead = byte)
  when DQUOTE then parse_string(DQUOTE)
  when SQUOTE then parse_single_or_triple
  when UPPER_N then parse_upper_n # NaN vs None
  when MINUS, PLUS, DOT, UPPER_I, ZERO..NINE then parse_number
  when LOWER_T then parse_literal_keyword("true", true)
  when UPPER_T then parse_literal_keyword("True", true)
  when LOWER_F then parse_literal_keyword("false", false)
  when UPPER_F then parse_literal_keyword("False", false)
  when LOWER_N then parse_literal_keyword("null", nil)
  when LOWER_U then parse_literal_keyword("undefined", nil)
  else
    quote_kind = smart_quote_kind(@pos)
    raise error("unexpected character #{display_byte(lead)}") unless quote_kind

    parse_smart_string(quote_kind)
  end
end
517
+
518
+ # Disambiguate NaN (number) from None (Python null) at a strict position.
519
# A leading 'N' is either NaN (second byte 'a', handled by parse_number) or
# the None literal, which yields nil.
def parse_upper_n
  return parse_number if byte_at(1) == 0x61 # 'a' → NaN

  parse_literal_keyword("None", nil)
end
526
+
527
+ # Value in object-value or array-element position: quoteless allowed.
528
# Parse a value in object-value / array-element position: quoted and
# smart-quoted strings are handled directly, anything else is read as a
# quoteless run and classified. Raises at end of input.
def parse_member_value
  skip_whitespace_and_comments
  raise error("unexpected end of input") if eof?

  lead = byte
  return parse_string(DQUOTE) if lead == DQUOTE
  return parse_single_or_triple if lead == SQUOTE

  quote_kind = smart_quote_kind(@pos)
  return parse_smart_string(quote_kind) if quote_kind

  parse_quoteless_or_literal
end
541
+
542
+ # Smart / curly quotes (U+201C/201D double, U+2018/2019 single), UTF-8
543
+ # E2 80 9C/9D/98/99. Returns :double, :single, or nil.
544
# Classify the three bytes at `pos`: :double for E2 80 9C / 9D, :single for
# E2 80 98 / 99, nil for anything else.
def smart_quote_kind(pos)
  return nil if @input.getbyte(pos) != 0xE2 || @input.getbyte(pos + 1) != 0x80

  third = @input.getbyte(pos + 2)
  if third == 0x9C || third == 0x9D
    :double
  elsif third == 0x98 || third == 0x99
    :single
  end
end
552
+
553
+ # Content between smart quotes is taken literally (no escape processing).
554
+ # Accepts either curly variant as opener/closer (lenient about direction).
555
# Read a smart-quoted string of the given kind (:double or :single). The text
# is taken literally (no escapes) up to the next curly quote of the same kind,
# in either direction. Raises when no closing quote is found.
def parse_smart_string(kind)
  closers = kind == :double ? [0x9C, 0x9D] : [0x98, 0x99]
  advance(3) # the opening quote is three bytes long
  start = @pos
  until eof?
    at_closer = @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
                closers.include?(@input.getbyte(@pos + 2))
    unless at_closer
      advance(1)
      next
    end

    text = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
    advance(3)
    return text
  end
  raise error("unterminated smart-quoted string")
end
570
+
571
# Store key => value in `hash`, symbolizing the key when @symbolize_keys is
# set. A repeated key follows @duplicate_key: :raise raises, :first_wins keeps
# the existing entry (returning nil), anything else overwrites; the non-raising
# cases record a :duplicate_key warning.
def store_member(hash, key, value)
  slot = @symbolize_keys ? key.to_sym : key
  duplicate = hash.key?(slot)
  raise error("duplicate key #{slot.inspect}") if duplicate && @duplicate_key == :raise

  warn(:duplicate_key, "duplicate key #{slot.inspect} — #{@duplicate_key}") if duplicate
  return if duplicate && @duplicate_key == :first_wins

  hash[slot] = value
end
581
+
582
# Parse an object key: a double- or single-quoted string, or an unquoted
# identifier. Raises when the current byte cannot start a key.
def parse_object_key
  lead = byte
  case lead
  when DQUOTE, SQUOTE
    parse_string(lead)
  else
    raise error("expected a key") unless lead && key_start_byte?(lead)

    parse_identifier_key
  end
end
590
+
591
# True when `b` may start an unquoted key: A-Z, a-z, '_' or '$'.
def key_start_byte?(b)
  return true if b == UNDERSCORE || b == DOLLAR

  b.between?(0x41, 0x5A) || b.between?(0x61, 0x7A)
end
597
+
598
# True when `b` may continue an unquoted key: any key-start byte, a digit,
# or a hyphen.
def key_continue_byte?(b)
  b == MINUS || b.between?(ZERO, NINE) || key_start_byte?(b)
end
601
+
602
# Consume an unquoted key: the current byte plus every following
# key-continue byte. Returns it tagged with the input's encoding.
def parse_identifier_key
  first = @pos
  advance(1) # the caller has already validated the first byte
  loop do
    b = byte
    break unless b && key_continue_byte?(b)

    advance(1)
  end
  @input.byteslice(first, @pos - first).force_encoding(@input.encoding)
end
608
+
609
+ # --- quoteless strings & literal classification ---
610
+
611
# Read a quoteless run and classify it as a literal, a number or a plain
# string. Raises when the run is empty, i.e. the position did not move —
# returning instead would leave the caller looping on the same byte forever.
def parse_quoteless_or_literal
  origin = @pos
  scan_quoteless_run
  length = @pos - origin
  raise error("expected a value") if length.zero?

  token = @input.byteslice(origin, length).force_encoding(@input.encoding)
  classify_quoteless(trim_blank(token))
end
623
+
624
+ # Advance to the end of a quoteless run. Stops at structural punctuation
625
+ # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
626
+ # whitespace. Spaces by themselves are not delimiters.
627
# Advance over a quoteless run. Stops (without consuming) at ',', '}', ']',
# LF, CR, end of input, or a comment marker ('#', '//', '/*') that directly
# follows whitespace. Whitespace alone does not end the run.
def scan_quoteless_run
  after_ws = false
  while (b = byte)
    break if b == COMMA || b == RBRACE || b == RBRACKET || b == LF || b == CR
    break if after_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))

    if b >= 0x80
      width = multibyte_ws_len(@pos)
      if width.positive?
        # multibyte whitespace: one column, `width` bytes
        @pos += width
        @col += 1
        after_ws = true
      else
        advance(1)
        after_ws = false
      end
    elsif b == SPACE || b.between?(TAB, CR) # tab/VT/FF/space (LF/CR already broke)
      advance(1)
      after_ws = true
    else
      advance(1)
      after_ws = false
    end
  end
end
648
+
649
# Return `str` without its leading and trailing [[:space:]] runs.
def trim_blank(str)
  without_head = str.sub(BLANK_HEAD, "")
  without_head.sub(BLANK_TAIL, "")
end
652
+
653
# Turn a trimmed quoteless token into its value: a recognised literal
# (true/false/null and their Python / JS spellings, NaN, ±Infinity), a number,
# or — when it is neither — the token itself as a String.
def classify_quoteless(str)
  return true if %w[true True].include?(str)
  return false if %w[false False].include?(str)
  return nil if %w[null None undefined].include?(str)
  return Float::NAN if str == "NaN"
  return Float::INFINITY if %w[Infinity +Infinity].include?(str)
  return -Float::INFINITY if str == "-Infinity"

  number = numeric_value(str)
  number.equal?(NOT_NUMERIC) ? str : number
end
666
+
667
+ # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
668
# Convert a whole token to a number. Hex tokens become Integers, tokens with
# a '.' or exponent go through decimal_value, other decimals become Integers.
# Returns NOT_NUMERIC when the token is not a number. Underscores are dropped.
def numeric_value(str)
  if HEX_RE.match?(str)
    unsigned = str.sub(/\A[-+]/, "").delete("_") # "0x...."
    magnitude = unsigned[2..-1].to_i(16)
    return str.start_with?("-") ? -magnitude : magnitude
  end
  return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)

  plain = str.delete("_")
  return plain.to_i unless plain.match?(/[.eE]/)

  decimal_value(plain)
end
680
+
681
+ # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
682
+ # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
683
+ # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
684
# Convert a decimal token (one with a '.' or an exponent, underscores already
# removed) according to @bigdecimal_load:
#   :float      -> Float
#   :bigdecimal -> to_big_decimal(body)
#   otherwise   -> to_big_decimal when the mantissa has more than 16
#                  significant digits, else Float
def decimal_value(body)
  case @bigdecimal_load
  when :float then decimal_to_float(body)
  when :bigdecimal then to_big_decimal(body)
  else significant_digits(body) > 16 ? to_big_decimal(body) : decimal_to_float(body)
  end
end

# Float conversion that agrees with the BigDecimal path on a digit-less dot
# before an exponent: "5.e3" is read as "5.0e3" (5000.0). A plain to_f stops
# parsing at such a dot and would silently drop the exponent.
def decimal_to_float(body)
  body.sub(/\.(?=[eE])/, ".0").to_f
end
691
+
692
# Count the mantissa's digits, ignoring any exponent, every non-digit
# character and leading zeros.
def significant_digits(body)
  mantissa = body.sub(/[eE].*\z/, "")
  digits = mantissa.gsub(/[^0-9]/, "")
  digits.sub(/\A0+/, "").length
end
695
+
696
# Convert a decimal token to BigDecimal. Only tokens matching
# NEEDS_DECIMAL_FIXUP (a leading dot, or a dot with no digit after it) are
# rewritten first; everything else goes to BigDecimal() untouched. Falls back
# to Float when BigDecimal() raises ArgumentError.
def to_big_decimal(body)
  literal = body
  literal = normalize_for_bigdecimal(literal) if NEEDS_DECIMAL_FIXUP.match?(literal)
  BigDecimal(literal)
rescue ArgumentError
  literal.to_f
end
706
+
707
+ # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
708
# Rewrite a decimal so that every dot has a digit on both sides:
# ".5" -> "0.5", "5." -> "5.0", "5.e3" -> "5.0e3".
def normalize_for_bigdecimal(body)
  with_leading_zero = body.sub(/\A([+-]?)\./, '\10.')
  with_leading_zero.sub(/\.([eE]|\z)/, '.0\1')
end
711
+
712
+ # --- quoted strings ---
713
+
714
# At a single quote: three quotes in a row open a triple-quoted string,
# anything else is an ordinary single-quoted string.
def parse_single_or_triple
  triple = byte_at(1) == SQUOTE && byte_at(2) == SQUOTE
  triple ? parse_triple_quoted : parse_string(SQUOTE)
end
721
+
722
# Read a '''...''' string. The column of the opening quotes is captured first
# and passed to strip_triple as the indent to remove from continuation lines.
# Raises when the closing ''' is missing.
def parse_triple_quoted
  indent = @col - 1
  advance(3)
  body_start = @pos
  advance(1) until eof? || (byte == SQUOTE && byte_at(1) == SQUOTE && byte_at(2) == SQUOTE)
  raise error("unterminated triple-quoted string") if eof?

  raw = @input.byteslice(body_start, @pos - body_start).force_encoding(@input.encoding)
  advance(3)
  strip_triple(raw, indent)
end
737
+
738
# Clean the raw body of a triple-quoted string: normalize CR / CR LF to LF,
# drop the first line when the body starts with a newline, remove up to
# `indent` leading spaces/tabs from every later line, and drop a final line
# that holds only spaces/tabs.
def strip_triple(raw, indent)
  normalized = raw.gsub(/\r\n?/, "\n")
  first, *rest = normalized.split("\n", -1)
  kept = rest.map { |line| strip_indent(line, indent) }
  # the first line is never de-indented; it is skipped when the body opens with a newline
  kept.unshift(first) unless first.nil? || normalized.start_with?("\n")
  kept.pop if kept.last && kept.last =~ /\A[ \t]*\z/
  kept.join("\n").force_encoding(@input.encoding)
end
753
+
754
# Remove at most `indent` leading space/tab characters from `line`.
def strip_indent(line, indent)
  window = line[0, [indent, 0].max]
  cut = window.each_char.take_while { |ch| ch == " " || ch == "\t" }.size
  line[cut..-1] || ""
end
759
+
760
# Read a string delimited by the byte `quote`, starting on the opening quote.
# A body without backslashes is sliced straight out of the input; otherwise it
# is decoded by decode_string_with_escapes. Raises when the string (or a
# trailing escape) runs into end of input.
def parse_string(quote)
  advance(1)
  start = @pos
  escaped = false
  loop do
    b = byte
    raise error("unterminated string") if b.nil?

    if b == quote
      text =
        if escaped
          decode_string_with_escapes(start, @pos, quote)
        else
          @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
        end
      advance(1)
      return text
    end

    if b == BACKSLASH
      escaped = true
      advance(1)
      raise error("unterminated string escape") if eof?
    end
    # consumes either an ordinary byte or the byte following a backslash
    advance(1)
  end
end
787
+
788
# Decode the string body in the byte range [start, finish), resolving
# backslash escapes. Bytes are collected in a binary buffer and the result is
# tagged with the input's encoding. Raises on an unknown escape.
def decode_string_with_escapes(start, finish, _quote)
  out = String.new(encoding: Encoding::ASCII_8BIT)
  cursor = start
  while cursor < finish
    current = @input.getbyte(cursor)
    cursor += 1
    if current != BACKSLASH
      out << current
      next
    end

    # cursor now sits on the byte that follows the backslash
    code = @input.getbyte(cursor)
    if code == LOWER_U
      codepoint, used = decode_unicode_escape(cursor)
      out << [codepoint].pack("U").b
      cursor += used
      next
    end

    case code
    when DQUOTE then out << '"'.b
    when SQUOTE then out << "'".b
    when BACKSLASH then out << "\\".b
    when SLASH then out << "/".b
    when 0x62 then out << "\b".b
    when 0x66 then out << "\f".b
    when 0x6E then out << "\n".b
    when 0x72 then out << "\r".b
    when 0x74 then out << "\t".b
    when LF
      # JSON5 line continuation: \<LF> emits nothing
    when CR
      # \<CR><LF> is one continuation: step over the LF as well
      cursor += 1 if @input.getbyte(cursor + 1) == LF
    else
      raise error("invalid escape \\#{code&.chr || "?"}")
    end
    cursor += 1
  end
  out.force_encoding(@input.encoding)
end
826
+
827
# Decode the \uXXXX escape whose 'u' sits at byte index `i`.
#
# Returns [codepoint, consumed], where `consumed` is the number of bytes used
# counting from the 'u': 5 for a single escape, 11 for a surrogate pair.
# Raises the ParseError built by #error for a truncated or non-hex escape and
# for any surrogate that is not part of a well-formed high + low pair.
def decode_unicode_escape(i)
  raise error("incomplete \\u escape") if i + 4 >= @bytesize

  hex = @input.byteslice(i + 1, 4)
  # Match on a binary view: the 4 bytes may split a raw multibyte character, and a
  # regex on an invalid-UTF-8 String raises ArgumentError. On binary, non-hex bytes
  # simply fail the match and we raise a clean ParseError below.
  raise error("invalid \\u escape") unless hex.b.match?(/\A\h{4}\z/)

  cp = hex.to_i(16)
  consumed = 5
  # A low surrogate with no preceding high surrogate cannot be encoded:
  # packing it would put an invalid byte sequence into the result string.
  raise error("unpaired low surrogate in string") if cp >= 0xDC00 && cp <= 0xDFFF

  if cp >= 0xD800 && cp <= 0xDBFF
    unless @input.getbyte(i + consumed) == BACKSLASH && @input.getbyte(i + consumed + 1) == LOWER_U
      raise error("unpaired high surrogate in string")
    end

    hex2 = @input.byteslice(i + consumed + 2, 4)
    raise error("invalid low surrogate \\u escape") unless hex2 && hex2.bytesize == 4 && hex2.b.match?(/\A\h{4}\z/)

    cp2 = hex2.to_i(16)
    raise error("invalid low surrogate value") unless cp2 >= 0xDC00 && cp2 <= 0xDFFF

    # combine the pair into one supplementary-plane code point
    cp = 0x10000 + ((cp - 0xD800) << 10) + (cp2 - 0xDC00)
    consumed += 6
  end
  [cp, consumed]
end
854
+
855
+ # --- numbers (top-level / strict positions) ---
856
+
857
# Parse a number at a strict position: optional sign, then Infinity / NaN, a
# hex integer (0x...), or a decimal with optional fraction and exponent.
# Underscores between digits are dropped.
#
# Returns an Integer, a Float, or whatever decimal_value produces for tokens
# with a '.' or exponent. Raises the ParseError built by #error for malformed
# input, including a mantissa with no digit at all (".", "-.", ".e5").
def parse_number
  negative = false
  if byte == MINUS
    negative = true
    advance(1)
  elsif byte == PLUS
    advance(1)
  end

  if byte == UPPER_I
    consume_keyword!("Infinity")
    return negative ? -Float::INFINITY : Float::INFINITY
  end
  if byte == UPPER_N
    consume_keyword!("NaN")
    return Float::NAN
  end

  int_start = @pos

  if byte == ZERO
    advance(1)
    if [LOWER_X, UPPER_X].include?(byte)
      advance(1)
      hex_start = @pos
      advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
      raise error("invalid hex number") if @pos == hex_start

      value = @input.byteslice(hex_start, @pos - hex_start).delete("_").to_i(16)
      return negative ? -value : value
    end
  elsif byte && byte >= 0x31 && byte <= NINE
    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
  elsif byte == DOT
    # leading decimal handled below
  else
    raise error("invalid number")
  end
  # Both integer branches start on a digit, so any progress means a digit was read.
  mantissa_has_digit = @pos > int_start

  is_float = false

  if byte == DOT
    is_float = true
    advance(1)
    frac_start = @pos
    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
    mantissa_has_digit ||= @input.byteslice(frac_start, @pos - frac_start).match?(/[0-9]/)
  end

  # A lone dot is not a number: the mantissa needs at least one digit, the
  # same rule the quoteless path enforces through DEC_RE.
  raise error("invalid number") unless mantissa_has_digit

  if [LOWER_E, UPPER_E].include?(byte)
    is_float = true
    advance(1)
    advance(1) if [PLUS, MINUS].include?(byte)
    raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE

    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
  end

  slice = @input.byteslice(int_start, @pos - int_start).delete("_")
  value = is_float ? decimal_value(slice) : slice.to_i
  negative ? -value : value
end
917
+
918
# True when `b` is the byte of a hexadecimal digit (0-9, A-F, a-f).
def hex_digit?(b)
  b.between?(ZERO, NINE) || b.between?(0x41, 0x46) || b.between?(0x61, 0x66)
end
923
+
924
# Require the bytes at the current position to spell `word` exactly, then
# advance past it. Raises (without consuming anything) on the first mismatch.
def consume_keyword!(word)
  word.each_byte.with_index do |expected, offset|
    raise error("invalid literal #{word.inspect}") unless byte_at(offset) == expected
  end
  advance(word.bytesize)
end
930
+
931
# Consume the keyword `word` (raising when the input does not spell it) and
# return the Ruby `value` it stands for.
def parse_literal_keyword(word, value)
  consume_keyword!(word)
  return value
end
935
+
936
+ # Record a non-fatal lenient fix (only when built with warnings: true).
937
# Record a non-fatal lenient fix, stamped with the current line and column.
# Does nothing (returns nil) unless warnings collection was enabled.
def warn(type, message)
  @warnings << Warning.new(type, message, @line, @col) if @collect_warnings
end
942
+
943
# Build (not raise) a ParseError carrying `message` and the current line and
# column; callers raise the returned object.
def error(message)
  line = @line
  col = @col
  ParseError.new(message, line, col)
end
946
+
947
# Render a byte for an error message: "EOF" for nil, the quoted character for
# printable ASCII (0x20..0x7E), otherwise a two-digit hex code such as "0x0A".
def display_byte(b)
  case b
  when nil then "EOF"
  when 0x20...0x7F then "'#{b.chr}'"
  else format("0x%02X", b)
  end
end
956
+ end
957
+ end