smarter_json 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,926 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterJSON
4
+ # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
5
+ # from the shared SmarterJSON::Error base.
6
+
7
+ module_function
8
+
9
+ # SmarterJSON.process(input, options = {}) — the main entry point.
10
+ #
11
+ # `input` is either a String of JSON content or an IO to read from. (A String
12
+ # is always content, never a filename — use process_file for paths.) The values
13
+ # in `options` override Parser::DEFAULT_OPTIONS.
14
+ #
15
+ # Without a block: returns nil (zero documents), the value (one document), or an
16
+ # Array of the values (two or more — NDJSON / JSONL / concatenated / whitespace-
17
+ # separated). :acceleration (default true) selects the C extension when compiled
18
+ # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
+ #
20
+ # With a block: yields each top-level document as it is parsed, and returns nil.
21
+ # For an IO this streams document-by-document in bounded memory — it reads the
22
+ # stream as newline-delimited documents (NDJSON / JSONL), one per line.
23
# Dispatches on the kind of input: String content is parsed in memory, an IO
# is either streamed line-by-line (block given) or read fully and parsed.
# Raises ArgumentError for anything that is neither a String nor readable.
def process(input, options = {}, &block)
  return process_content(input, options, &block) if input.is_a?(String)

  unless input.respond_to?(:read)
    raise ArgumentError, "SmarterJSON.process expects a String or an IO, got #{input.class}"
  end

  return stream_io(input, options, &block) if block

  process_content(input.read, options)
end
32
+
33
+ # SmarterJSON.process_file(path, options = {}) — open a file and process it.
34
+ #
35
+ # The :encoding option labels the file's encoding (default "UTF-8"); it does NOT
36
+ # trigger a transcoding pass — the parser works on the bytes in their native
37
+ # encoding and emits string values with the same encoding tag. With a block,
38
+ # streams document-by-document straight from disk in bounded memory (never
39
+ # loading the whole file); the documents are read as newline-delimited
40
+ # (NDJSON / JSONL), one per line.
41
# Opens `path` and processes its content. With a block the file is streamed
# line-by-line through stream_io; without one it is read fully and handed to
# process_content. Returns whatever the chosen path returns.
def process_file(path, options = {}, &block)
  # `||` rather than fetch's default: an explicit `encoding: nil` (the value
  # Parser::DEFAULT_OPTIONS carries) must also fall back to UTF-8, otherwise
  # the streaming branch would build the malformed mode string "r:".
  encoding = options[:encoding] || "UTF-8"

  if block
    File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
  else
    process_content(File.read(path, encoding: encoding), options)
  end
end
49
+
50
+ # Parse a String of JSON content (the in-memory path). Returns nil (block) or
51
+ # the value / Array (no block); the C extension is used when available.
52
# In-memory parse of a String. The accelerated path (parse_c) is taken only
# when the :acceleration option is truthy (default true) AND the extension is
# loaded; otherwise the pure-Ruby Parser handles it. A block, when given, is
# forwarded so documents are yielded instead of returned.
def process_content(input, options, &block)
  accelerated = options.fetch(:acceleration, true) && HAS_ACCELERATION
  return parse_c(input, options, &block) if accelerated

  parser = Parser.new(input, options)
  block ? parser.each_value(&block) : parser.parse
end
65
+
66
+ # Stream documents from an IO, one line (= one document) at a time, yielding
67
+ # each — bounded memory. Newline-delimited (NDJSON / JSONL); a single document
68
+ # spanning multiple lines is not supported by the streaming path.
69
# Feeds the IO to process_content one "\n"-terminated line at a time, so only
# a single line is held at once. Always returns nil.
def stream_io(io, options, &block)
  io.each_line("\n") do |document_line|
    process_content(document_line, options, &block)
  end
  nil
end
73
+
74
+ private_class_method :process_content, :stream_io
75
+
76
+ # Hand-rolled FSM single-pass parser.
77
+ # Layer 1: strict JSON (RFC 8259).
78
+ # Layer 2: JSON5 additions — line/block comments, trailing comma,
79
+ # unquoted ECMAScript identifier keys, single-quoted strings,
80
+ # hex numbers, leading/trailing decimal points, Infinity/NaN,
81
+ # explicit + sign, \-line-continuation inside strings.
82
+ # Layer 3: HJSON-inspired additions — #/comment-marker rule, triple-quoted
83
+ # strings, quoteless single-line strings, implicit root object,
84
+ # newline-as-separator, broader unquoted keys, recognized-literals-win.
85
+ # Layer 4: smarter_json additions — UTF-8 BOM skip, smart/curly quotes,
86
+ # Python literals (True/False/None) and undefined, underscores in
87
+ # numeric literals, and encoding validation (SmarterJSON::EncodingError).
88
+ class Parser
89
# Byte values the scanner compares against, spelled as the characters they
# stand for.

# Structural punctuation and quoting.
LBRACE = "{".ord
RBRACE = "}".ord
LBRACKET = "[".ord
RBRACKET = "]".ord
COLON = ":".ord
COMMA = ",".ord
DQUOTE = '"'.ord
SQUOTE = "'".ord
BACKSLASH = "\\".ord

# Comment markers.
SLASH = "/".ord
STAR = "*".ord
HASH = "#".ord

# Number syntax.
MINUS = "-".ord
PLUS = "+".ord
DOT = ".".ord
ZERO = "0".ord
NINE = "9".ord
LOWER_E = "e".ord
UPPER_E = "E".ord

# First letters of keywords and the hex marker.
LOWER_T = "t".ord
LOWER_F = "f".ord
LOWER_N = "n".ord
LOWER_U = "u".ord
LOWER_X = "x".ord
UPPER_X = "X".ord
UPPER_I = "I".ord
UPPER_N = "N".ord
UPPER_T = "T".ord
UPPER_F = "F".ord

# Identifier extras.
UNDERSCORE = "_".ord
DOLLAR = "$".ord

# ASCII whitespace.
SPACE = " ".ord
TAB = "\t".ord
LF = "\n".ord
CR = "\r".ord

# Unique sentinel returned by numeric_value when a token is not a number.
NOT_NUMERIC = Object.new

# Whole-token shapes for hex and decimal literals (underscores allowed).
HEX_RE = /\A[-+]?0[xX][0-9a-fA-F_]+\z/.freeze
DEC_RE = /\A[-+]?(?:0|[1-9][0-9_]*)?(?:\.[0-9_]*)?(?:[eE][-+]?[0-9_]+)?\z/.freeze

# Matches a decimal that BigDecimal() would reject as written: a leading dot
# (".5") or a dot not followed by a digit ("5.", "5.e3"). It matches exactly
# when normalize_for_bigdecimal would change the string.
NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze

# Leading / trailing runs of [[:space:]] characters.
BLANK_HEAD = /\A[[:space:]]+/.freeze
BLANK_TAIL = /[[:space:]]+\z/.freeze

# Every caller-facing setting, with its default.
DEFAULT_OPTIONS = {
  acceleration: true,        # use the C extension when available
  encoding: nil,             # label the input's encoding (no transcoding)
  symbolize_keys: false,     # Symbol keys instead of String
  duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
  bigdecimal_load: :auto,    # :auto | :float | :bigdecimal (Oj-compatible)
}.freeze
143
+
144
# Builds a parser over `input` (must be a String). `options` are merged over
# DEFAULT_OPTIONS. When an :encoding is given the input is duplicated and
# relabelled (no transcoding); invalid bytes for the resulting encoding raise
# EncodingError. The cursor starts after a UTF-8 BOM when one is present.
def initialize(input, options = {})
  raise ArgumentError, "input must be a String" unless input.is_a?(String)

  settings = DEFAULT_OPTIONS.merge(options)
  @symbolize_keys, @duplicate_key, @bigdecimal_load =
    settings.values_at(:symbolize_keys, :duplicate_key, :bigdecimal_load)

  label = settings[:encoding]
  @input = label ? input.dup.force_encoding(label) : input
  unless @input.valid_encoding?
    raise EncodingError, "invalid byte sequence for #{@input.encoding.name}"
  end

  @bytesize = @input.bytesize
  # EF BB BF at offset 0 is a UTF-8 byte-order mark: start right after it.
  leading = [@input.getbyte(0), @input.getbyte(1), @input.getbyte(2)]
  @pos = leading == [0xEF, 0xBB, 0xBF] ? 3 : 0
  @line = 1
  @col = 1
end
162
+
163
+ # No block: auto-detect the document count for free (the same "is there
164
+ # trailing content?" check that used to raise). 0 documents -> nil; 1 document
165
+ # -> the value itself (single-document path, no Array allocated); 2+ documents
166
+ # (NDJSON / JSONL / concatenated / whitespace-separated) -> an Array of every
167
+ # value. Commas do NOT separate documents (only whitespace / newline /
168
+ # concatenation do), so a bracketless comma list still raises in parse_document.
169
# Returns nil for an empty input, the bare value for exactly one document, and
# an Array of values when more documents follow the first.
def parse
  skip_whitespace_and_comments
  return nil if eof?

  first = parse_document
  skip_whitespace_and_comments
  return first if eof?

  # More input remains: switch to collecting every document.
  documents = [first]
  until eof?
    documents.push(parse_document)
    skip_whitespace_and_comments
  end
  documents
end
184
+
185
+ # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
186
+ # whitespace-separated). Used by the block form of SmarterJSON.process.
187
# Yields every top-level document in order until the input is exhausted.
# Always returns nil.
def each_value
  loop do
    skip_whitespace_and_comments
    break if eof?

    document = parse_document
    yield document
  end
  nil
end
196
+
197
+ private
198
+
199
+ # --- top-level dispatch ---
200
+
201
# Parses one document, first deciding (by lookahead) whether it is a
# brace-less implicit root object.
def parse_document
  implicit_root = implicit_root_object_ahead?
  parse_iter(implicit_root)
end
204
+
205
+ # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
206
+ # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
207
+ # never by the call stack. Mirrors the C driver to keep the two paths in
208
+ # parity.
209
# Parses one document with an explicit stack of open containers instead of
# recursion. `implicit_root` true means the document is a brace-less object:
# a Hash is opened up front and end-of-input closes it. Returns the root
# container, or the scalar from parse_value for a non-container document.
def parse_iter(implicit_root)
  open_containers = []
  root = nil
  current = nil
  in_object = false
  awaiting_root = true

  if implicit_root
    root = {}
    open_containers.push(root)
    current = root
    in_object = true
    awaiting_root = false
  end

  loop do
    skip_whitespace_and_comments
    b = byte

    if awaiting_root
      # First significant byte of an explicit document.
      case b
      when LBRACE, LBRACKET
        advance(1)
        root = b == LBRACE ? {} : []
        open_containers.push(root)
        current = root
        in_object = (b == LBRACE)
        awaiting_root = false
      when nil
        raise error("unexpected end of input")
      else
        return parse_value
      end
      next
    end

    # The matching closer pops the current container; an empty stack means the
    # document is complete.
    if b == (in_object ? RBRACE : RBRACKET)
      advance(1)
      open_containers.pop
      return root if open_containers.empty?

      current = open_containers.last
      in_object = current.is_a?(Hash)
      skip_separator_run
      next
    end

    if in_object
      if b.nil?
        # Only an implicit root object may end at end-of-input.
        return root if implicit_root && open_containers.size == 1

        raise error("unterminated object")
      end
      raise error("unexpected ']' — expected a key or '}'") if b == RBRACKET

      key = parse_object_key
      skip_whitespace_and_comments
      raise error("expected ':' after key #{key.inspect}") unless byte == COLON

      advance(1)
      skip_whitespace_and_comments
      b = byte
      raise error("unexpected end of input") if b.nil?

      if b == LBRACE || b == LBRACKET
        child = b == LBRACE ? {} : []
        advance(1) # consume { or [
        store_member(current, key, child)
        open_containers.push(child)
        current = child
        in_object = (b == LBRACE)
      else
        store_member(current, key, parse_member_value)
        skip_separator_run
      end
    else
      raise error("unterminated array") if b.nil?
      raise error("unexpected '}' — expected ']' or a value") if b == RBRACE

      if b == LBRACE || b == LBRACKET
        child = b == LBRACE ? {} : []
        advance(1) # consume { or [
        current.push(child)
        open_containers.push(child)
        current = child
        in_object = (b == LBRACE)
      else
        current.push(parse_member_value)
        skip_separator_run
      end
    end
  end
end
311
+
312
+ # At the start of a document: an unquoted identifier followed by ':' means
313
+ # an implicit root object (no outer braces). Look ahead without consuming.
314
# True when the cursor sits on an unquoted key that is followed (after optional
# whitespace) by ':'. The cursor and line/column counters are restored before
# returning, so nothing is consumed.
def implicit_root_object_ahead?
  first = byte
  return false unless first && key_start_byte?(first)

  pos_before = @pos
  line_before = @line
  col_before = @col

  while (c = byte) && key_continue_byte?(c)
    advance(1)
  end
  skip_pure_whitespace
  colon_follows = (byte == COLON)

  @pos = pos_before
  @line = line_before
  @col = col_before
  colon_follows
end
325
+
326
+ # --- byte access ---
327
+
328
# The byte under the cursor, or nil once the cursor is past the last byte.
def byte
  @input.getbyte(@pos)
end
331
+
332
# The byte `offset` positions after the cursor, or nil when that is past the
# end of the input.
def byte_at(offset)
  @input.getbyte(offset + @pos)
end
335
+
336
# True once the cursor has reached or passed the end of the input.
def eof?
  @bytesize <= @pos
end
339
+
340
# Moves the cursor forward `n` steps while keeping @line / @col current.
# LF, CR and CRLF each count as ONE step that starts a new line (so CRLF moves
# the cursor two bytes). Stops early, returning nil, at end of input.
def advance(n = 1)
  n.times do
    case @input.getbyte(@pos)
    when nil
      return
    when LF
      @pos += 1
      @line += 1
      @col = 1
    when CR
      # Swallow the LF of a CRLF pair in the same step.
      @pos += @input.getbyte(@pos + 1) == LF ? 2 : 1
      @line += 1
      @col = 1
    else
      @pos += 1
      @col += 1
    end
  end
end
360
+
361
+ # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
362
+
363
# Skips a run of whitespace: ASCII space and 0x09..0x0D via advance (which
# tracks line breaks), plus the multibyte whitespace characters recognised by
# multibyte_ws_len (each counted as one column). Comments are not touched.
def skip_pure_whitespace
  while (b = byte)
    if b == SPACE || b.between?(TAB, CR)
      advance(1)
    elsif b >= 0x80 && (width = multibyte_ws_len(@pos)).positive?
      @pos += width
      @col += 1
    else
      break
    end
  end
end
381
+
382
+ # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
383
+ # Only meaningful for bytes >= 0x80.
384
# Byte length (2 or 3) of the multibyte whitespace character that starts at
# `pos`, or 0 when the bytes there are not one of the recognised sequences.
# Intended for lead bytes >= 0x80.
def multibyte_ws_len(pos)
  lead = @input.getbyte(pos)
  # Only lead bytes C2 and E1..E3 can begin a recognised sequence.
  return 0 unless lead == 0xC2 || (lead >= 0xE1 && lead <= 0xE3)

  second = @input.getbyte(pos + 1)
  return 0 if second.nil?

  if lead == 0xC2
    # C2 A0 = NBSP, C2 85 = NEL
    return (second == 0xA0 || second == 0x85) ? 2 : 0
  end

  third = @input.getbyte(pos + 2)
  return 0 if third.nil?

  recognised =
    case lead
    when 0xE1
      second == 0x9A && third == 0x80 # U+1680
    when 0xE2
      if second == 0x80
        third.between?(0x80, 0x8A) || third == 0xA8 || third == 0xA9 || third == 0xAF
      else
        second == 0x81 && third == 0x9F # U+205F
      end
    else
      second == 0x80 && third == 0x80 # lead is E3: U+3000
    end
  recognised ? 3 : 0
end
409
+
410
+ # A '#', '//', or '/*' starts a comment only when preceded by whitespace
411
+ # or at the very start of input (the comment-marker rule).
412
# Skips any mix of whitespace and comments. A '#', '//' or '/*' only counts as
# a comment opener when preceded_by_ws_or_start? says so; otherwise scanning
# stops with the cursor on that byte.
def skip_whitespace_and_comments
  loop do
    skip_pure_whitespace
    b = byte
    break if b.nil?

    block_comment = b == SLASH && byte_at(1) == STAR
    line_comment = b == HASH || (b == SLASH && byte_at(1) == SLASH)
    break unless block_comment || line_comment
    break unless preceded_by_ws_or_start?

    block_comment ? skip_block_comment : skip_to_eol
  end
end
429
+
430
# True when the cursor is at the very start of the input or directly follows a
# whitespace character — the condition under which a comment marker at the
# cursor actually opens a comment.
def preceded_by_ws_or_start?
  return true if @pos.zero?

  # The constructor starts the cursor at offset 3 when the input opens with a
  # UTF-8 BOM (EF BB BF), so offset 3 right after a BOM is also "the start".
  # Without this, a file that begins with BOM + comment is rejected.
  if @pos == 3 && @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF
    return true
  end

  prev = @input.getbyte(@pos - 1)
  return true if prev == SPACE || (prev >= TAB && prev <= CR)
  return false if prev < 0x80

  # Rare: a multibyte whitespace char ending right before @pos. Walk back over
  # continuation bytes (10xxxxxx) to its lead byte, then check that it is a
  # whitespace char whose last byte is the one just before the cursor.
  i = @pos - 1
  i -= 1 while i.positive? && (@input.getbyte(i) & 0xC0) == 0x80
  n = multibyte_ws_len(i)
  n.positive? && (i + n == @pos)
end
443
+
444
# Advances to (but not past) the next LF / CR, or to end of input.
def skip_to_eol
  until (c = byte).nil? || c == LF || c == CR
    advance(1)
  end
end
447
+
448
# Skips a /* ... */ comment whose opener is under the cursor. Raises a parse
# error when the input ends before the closing */.
def skip_block_comment
  advance(2) # consume /*
  advance(1) until eof? || (byte == STAR && byte_at(1) == SLASH)
  raise error("unterminated block comment") if eof?

  advance(2) # consume */
end
459
+
460
+ # Layer 1 (strict JSON) shape: whitespace + at most one comma + whitespace.
461
+ # The Lenient Commas Option becomes a one-line change here.
462
# Skips the gap between two members: whitespace/comments, then at most one
# comma, then whitespace/comments again.
def skip_separator_run
  skip_whitespace_and_comments
  if byte == COMMA
    advance(1)
    skip_whitespace_and_comments
  end
end
469
+
470
+ # --- values ---
471
+
472
+ # Top-level / strict value: no quoteless fallback.
473
# Parses a scalar in a strict position (e.g. a bare top-level value): quoted
# or smart-quoted strings, numbers, and the recognised keywords. There is no
# quoteless-string fallback; anything else raises a parse error.
def parse_value
  skip_whitespace_and_comments
  raise error("unexpected end of input") if eof?

  lead = byte
  return parse_string(DQUOTE) if lead == DQUOTE
  return parse_single_or_triple if lead == SQUOTE
  return parse_upper_n if lead == UPPER_N # NaN vs None

  if lead == MINUS || lead == PLUS || lead == DOT || lead == UPPER_I || lead.between?(ZERO, NINE)
    return parse_number
  end

  case lead
  when LOWER_T then parse_literal_keyword("true", true)
  when LOWER_F then parse_literal_keyword("false", false)
  when LOWER_N then parse_literal_keyword("null", nil)
  when LOWER_U then parse_literal_keyword("undefined", nil)
  when UPPER_T then parse_literal_keyword("True", true)
  when UPPER_F then parse_literal_keyword("False", false)
  else
    curly = smart_quote_kind(@pos)
    raise error("unexpected character #{display_byte(lead)}") unless curly

    parse_smart_string(curly)
  end
end
496
+
497
+ # Disambiguate NaN (number) from None (Python null) at a strict position.
498
# Called with the cursor on 'N': a following 'a' means NaN (handled by
# parse_number); anything else must spell None.
def parse_upper_n
  return parse_number if byte_at(1) == 0x61 # 'a'

  parse_literal_keyword("None", nil)
end
505
+
506
+ # Value in object-value or array-element position: quoteless allowed.
507
# Parses a scalar in object-value or array-element position. Quoted and
# smart-quoted strings are handled directly; everything else goes through the
# quoteless / literal classifier.
def parse_member_value
  skip_whitespace_and_comments
  raise error("unexpected end of input") if eof?

  lead = byte
  return parse_string(DQUOTE) if lead == DQUOTE
  return parse_single_or_triple if lead == SQUOTE

  curly = smart_quote_kind(@pos)
  return parse_smart_string(curly) if curly

  parse_quoteless_or_literal
end
520
+
521
+ # Smart / curly quotes (U+201C/201D double, U+2018/2019 single), UTF-8
522
+ # E2 80 9C/9D/98/99. Returns :double, :single, or nil.
523
# Classifies the three bytes at `pos`: :double for E2 80 9C/9D, :single for
# E2 80 98/99, nil for anything else.
def smart_quote_kind(pos)
  return nil if @input.getbyte(pos) != 0xE2 || @input.getbyte(pos + 1) != 0x80

  tail = @input.getbyte(pos + 2)
  if tail == 0x9C || tail == 0x9D
    :double
  elsif tail == 0x98 || tail == 0x99
    :single
  end
end
531
+
532
+ # Content between smart quotes is taken literally (no escape processing).
533
+ # Accepts either curly variant as opener/closer (lenient about direction).
534
# Reads a curly-quoted string whose opener (of the given kind) is under the
# cursor. Either curly variant of the same kind closes it; the content is
# returned verbatim with no escape processing. Raises when no closer is found.
def parse_smart_string(kind)
  closing_tails = kind == :double ? [0x9C, 0x9D] : [0x98, 0x99]
  advance(3) # the opener is three bytes long
  content_start = @pos

  until eof?
    at_closer = @input.getbyte(@pos) == 0xE2 &&
                @input.getbyte(@pos + 1) == 0x80 &&
                closing_tails.include?(@input.getbyte(@pos + 2))
    unless at_closer
      advance(1)
      next
    end

    content = @input.byteslice(content_start, @pos - content_start).force_encoding(@input.encoding)
    advance(3)
    return content
  end

  raise error("unterminated smart-quoted string")
end
549
+
550
# Stores key => value in `hash`, converting the key to a Symbol when
# @symbolize_keys is set. For a key already present, @duplicate_key decides:
# :first_wins keeps the old value (returns nil), :raise raises a parse error,
# anything else overwrites.
def store_member(hash, key, value)
  k = @symbolize_keys ? key.to_sym : key
  if hash.key?(k)
    return if @duplicate_key == :first_wins
    raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
  end
  hash[k] = value
end
560
+
561
# Parses an object key under the cursor: a double- or single-quoted string, or
# an unquoted identifier. Raises a parse error for anything else (including
# end of input).
def parse_object_key
  lead = byte
  case lead
  when DQUOTE, SQUOTE
    parse_string(lead)
  else
    raise error("expected a key") unless lead && key_start_byte?(lead)

    parse_identifier_key
  end
end
569
+
570
# True for bytes that may begin an unquoted key: ASCII letters, '_' and '$'.
def key_start_byte?(b)
  b.between?(0x41, 0x5A) || # A-Z
    b.between?(0x61, 0x7A) || # a-z
    b == UNDERSCORE ||
    b == DOLLAR
end
576
+
577
# True for bytes that may continue an unquoted key: any key-start byte, an
# ASCII digit, or a hyphen.
def key_continue_byte?(b)
  return true if key_start_byte?(b)

  b == MINUS || b.between?(ZERO, NINE)
end
580
+
581
# Consumes an unquoted key (the caller has already verified its first byte)
# and returns it as a String tagged with the input's encoding.
def parse_identifier_key
  first = @pos
  advance(1)
  while (b = byte) && key_continue_byte?(b)
    advance(1)
  end
  @input.byteslice(first, @pos - first).force_encoding(@input.encoding)
end
587
+
588
+ # --- quoteless strings & literal classification ---
589
+
590
# Consumes a quoteless run and classifies it (keyword, number, or plain
# string). An empty run — the cursor is already on a delimiter — raises, which
# also guarantees the caller's loop always makes progress.
def parse_quoteless_or_literal
  origin = @pos
  scan_quoteless_run
  consumed = @pos - origin
  raise error("expected a value") if consumed.zero?

  token = @input.byteslice(origin, consumed).force_encoding(@input.encoding)
  classify_quoteless(trim_blank(token))
end
602
+
603
+ # Advance to the end of a quoteless run. Stops at structural punctuation
604
+ # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
605
+ # whitespace. Spaces by themselves are not delimiters.
606
# Moves the cursor to the end of a quoteless run. The run ends at ',' '}' ']',
# LF, CR, end of input, or a comment marker that directly follows whitespace.
# Whitespace inside the run is consumed (the caller trims the edges).
def scan_quoteless_run
  after_ws = false
  while (b = byte)
    break if b == COMMA || b == RBRACE || b == RBRACKET || b == LF || b == CR

    if after_ws
      # A marker only opens a comment when whitespace came right before it.
      break if b == HASH
      break if b == SLASH && (byte_at(1) == SLASH || byte_at(1) == STAR)
    end

    if b == SPACE || b.between?(TAB, CR) # tab/VT/FF/space (LF/CR already broke)
      after_ws = true
      advance(1)
    elsif b >= 0x80 && (width = multibyte_ws_len(@pos)).positive?
      after_ws = true
      @pos += width
      @col += 1
    else
      after_ws = false
      advance(1)
    end
  end
end
627
+
628
# Returns a copy of `str` without leading and trailing [[:space:]] runs.
def trim_blank(str)
  without_head = str.sub(BLANK_HEAD, "")
  without_head.sub(BLANK_TAIL, "")
end
631
+
632
# Maps a trimmed quoteless token to its value: recognised keywords first,
# then numbers (via numeric_value), otherwise the token itself as a String.
def classify_quoteless(str)
  case str
  when "true", "True" then true
  when "false", "False" then false
  when "null", "None", "undefined" then nil
  when "NaN" then Float::NAN
  when "Infinity", "+Infinity" then Float::INFINITY
  when "-Infinity" then -Float::INFINITY
  else
    number = numeric_value(str)
    number.equal?(NOT_NUMERIC) ? str : number
  end
end
645
+
646
+ # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
647
# Converts a whole quoteless token to a number.
#
# Returns an Integer for hex ("0x1F") and plain integers, the result of
# decimal_value for anything with a '.' or exponent, and the NOT_NUMERIC
# sentinel when the token is not a number. Underscores are digit separators.
#
# DEC_RE makes every part of a decimal optional, so on its own it also accepts
# tokens such as "e5" or ".e5" (no mantissa digits) and "1e_" (no exponent
# digits); HEX_RE likewise accepts "0x_". Those are ordinary words, not
# numbers, so each part that is present must contain at least one real digit.
def numeric_value(str)
  if HEX_RE.match?(str)
    digits = str.sub(/\A[-+]?0[xX]/, "").delete("_")
    return NOT_NUMERIC if digits.empty? # "0x_" — separators only

    magnitude = digits.to_i(16)
    return str.start_with?("-") ? -magnitude : magnitude
  end
  return NOT_NUMERIC unless DEC_RE.match?(str)

  mantissa = str[/\A[^eE]*/]
  exponent = str[/[eE](.*)\z/, 1]
  return NOT_NUMERIC unless mantissa.match?(/[0-9]/)
  return NOT_NUMERIC if exponent && !exponent.match?(/[0-9]/)

  body = str.delete("_")
  body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
end
659
+
660
+ # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
661
+ # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
662
+ # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
663
# Converts a decimal literal according to @bigdecimal_load: :float always
# gives a Float, :bigdecimal always goes through to_big_decimal, and any other
# setting picks to_big_decimal only above 16 significant digits.
def decimal_value(body)
  mode = @bigdecimal_load
  return body.to_f if mode == :float
  return to_big_decimal(body) if mode == :bigdecimal

  significant_digits(body) > 16 ? to_big_decimal(body) : body.to_f
end
670
+
671
# Counts the mantissa's digits after dropping the exponent, every non-digit
# character, and leading zeros.
def significant_digits(body)
  mantissa = body.sub(/[eE].*\z/, "")
  digits = mantissa.delete("^0-9")
  digits.sub(/\A0+/, "").length
end
674
+
675
# Builds a BigDecimal from a decimal literal. The literal is rewritten first
# only when it has a bare leading/trailing dot (which BigDecimal() rejects).
# If BigDecimal() still raises ArgumentError, the Float value is returned.
def to_big_decimal(body)
  literal = body
  literal = normalize_for_bigdecimal(literal) if NEEDS_DECIMAL_FIXUP.match?(literal)
  BigDecimal(literal)
rescue ArgumentError
  literal.to_f
end
685
+
686
+ # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
687
# Inserts the zero BigDecimal() insists on: ".5" -> "0.5", "5." -> "5.0",
# "5.e3" -> "5.0e3". A leading sign is kept.
def normalize_for_bigdecimal(body)
  with_integer_part = body.sub(/\A([+-]?)\./) { "#{Regexp.last_match(1)}0." }
  with_integer_part.sub(/\.([eE]|\z)/) { ".0#{Regexp.last_match(1)}" }
end
690
+
691
+ # --- quoted strings ---
692
+
693
# Called with the cursor on a single quote: three quotes in a row open a
# triple-quoted string, otherwise it is an ordinary single-quoted string.
def parse_single_or_triple
  triple = byte_at(1) == SQUOTE && byte_at(2) == SQUOTE
  triple ? parse_triple_quoted : parse_string(SQUOTE)
end
700
+
701
# Reads a '''...''' string. The column of the opening quotes is remembered as
# the indent to strip from continuation lines (see strip_triple). Raises when
# the closing ''' is missing.
def parse_triple_quoted
  indent = @col - 1
  advance(3) # opening '''
  body_start = @pos

  until eof? || (byte == SQUOTE && byte_at(1) == SQUOTE && byte_at(2) == SQUOTE)
    advance(1)
  end
  raise error("unterminated triple-quoted string") if eof?

  raw = @input.byteslice(body_start, @pos - body_start).force_encoding(@input.encoding)
  advance(3) # closing '''
  strip_triple(raw, indent)
end
716
+
717
# Post-processes the raw body of a triple-quoted string: CRLF / CR become LF,
# a line break right after the opener is dropped, up to `indent` leading
# spaces/tabs are removed from every following line, and a final line that
# holds only spaces/tabs is dropped.
def strip_triple(raw, indent)
  text = raw.gsub(/\r\n?/, "\n")
  first, *rest = text.split("\n", -1)

  out = rest.map { |line| strip_indent(line, indent) }
  # The first line is kept verbatim unless the body began with a newline (in
  # which case it is the empty string before that newline).
  out.unshift(first) unless first.nil? || text.start_with?("\n")

  out.pop if out.last && out.last.match?(/\A[ \t]*\z/)
  out.join("\n").force_encoding(@input.encoding)
end
732
+
733
# Removes leading spaces/tabs from `line`, but never more than `indent`
# characters and never past the first non-blank character.
def strip_indent(line, indent)
  window = line[0, indent] || ""
  blanks = window[/\A[ \t]*/].length
  line[blanks..-1] || ""
end
738
+
739
# Reads a string delimited by the `quote` byte under the cursor. A string
# without backslashes is sliced straight out of the input; one with escapes is
# decoded by decode_string_with_escapes. Raises when the closing quote (or the
# byte after a backslash) is missing.
def parse_string(quote)
  advance(1) # opening quote
  content_start = @pos
  escaped = false

  until (b = byte).nil?
    if b == quote
      text =
        if escaped
          decode_string_with_escapes(content_start, @pos, quote)
        else
          @input.byteslice(content_start, @pos - content_start).force_encoding(@input.encoding)
        end
      advance(1) # closing quote
      return text
    end

    if b == BACKSLASH
      escaped = true
      advance(1)
      raise error("unterminated string escape") if eof?
    end
    # Step over an ordinary byte, or over the byte a backslash escapes.
    advance(1)
  end

  raise error("unterminated string")
end
766
+
767
# Decodes the bytes in [start, finish) — the inside of a quoted string —
# resolving backslash escapes. Output is assembled as raw bytes and tagged
# with the input's encoding at the end. Raises on an unknown escape.
def decode_string_with_escapes(start, finish, _quote)
  out = String.new(encoding: Encoding::ASCII_8BIT)
  idx = start
  while idx < finish
    b = @input.getbyte(idx)
    if b != BACKSLASH
      out << b
      idx += 1
      next
    end

    idx += 1
    esc = @input.getbyte(idx)
    if esc == LOWER_U
      cp, used = decode_unicode_escape(idx)
      out << [cp].pack("U").b
      idx += used
      next
    end

    case esc
    when DQUOTE, SQUOTE, BACKSLASH, SLASH
      out << esc # the escaped byte stands for itself
    when 0x62 then out << 0x08 # \b
    when 0x66 then out << 0x0C # \f
    when 0x6E then out << 0x0A # \n
    when 0x72 then out << 0x0D # \r
    when 0x74 then out << 0x09 # \t
    when LF
      # JSON5 line continuation: \<LF> emits nothing
    when CR
      # \<CR><LF> is one continuation: skip the LF as well
      idx += 1 if @input.getbyte(idx + 1) == LF
    else
      raise error("invalid escape \\#{esc&.chr || "?"}")
    end
    idx += 1
  end
  out.force_encoding(@input.encoding)
end
805
+
806
# Decodes a \uXXXX escape. `i` is the index of the 'u'. A high surrogate must
# be followed immediately by a \uXXXX low surrogate; the pair is combined into
# one code point.
#
# Returns [code_point, consumed], where `consumed` is the number of bytes used
# counting from the 'u' (5 for a single escape, 11 for a surrogate pair).
# Raises a parse error for truncated or non-hex escapes and for any surrogate
# that is not part of a valid pair.
def decode_unicode_escape(i)
  raise error("incomplete \\u escape") if i + 4 >= @bytesize

  # `.b`: the 4-byte window can cut a multibyte character in half; matching a
  # regex against such a broken string raises ArgumentError, not a parse error.
  hex = @input.byteslice(i + 1, 4).b
  raise error("invalid \\u escape") unless hex.match?(/\A\h{4}\z/)

  cp = hex.to_i(16)
  consumed = 5
  if cp >= 0xDC00 && cp <= 0xDFFF
    # A low surrogate with no preceding high surrogate is not a character;
    # packing it would yield a string that is invalid in UTF-8.
    raise error("unpaired low surrogate in string")
  elsif cp >= 0xD800 && cp <= 0xDBFF
    unless @input.getbyte(i + consumed) == BACKSLASH && @input.getbyte(i + consumed + 1) == LOWER_U
      raise error("unpaired high surrogate in string")
    end

    hex2 = @input.byteslice(i + consumed + 2, 4)
    hex2 = hex2.b if hex2
    raise error("invalid low surrogate \\u escape") unless hex2 && hex2.bytesize == 4 && hex2.match?(/\A\h{4}\z/)

    cp2 = hex2.to_i(16)
    raise error("invalid low surrogate value") unless cp2 >= 0xDC00 && cp2 <= 0xDFFF

    cp = 0x10000 + ((cp - 0xD800) << 10) + (cp2 - 0xDC00)
    consumed += 6
  end
  [cp, consumed]
end
830
+
831
+ # --- numbers (top-level / strict positions) ---
832
+
833
# Parses a number in a strict (non-quoteless) position, starting at the cursor.
#
# Accepts an optional sign, Infinity / NaN, hex (0x...), decimals with a
# leading or trailing dot, exponents, and '_' digit separators. Returns an
# Integer for hex and plain integers, a Float for Infinity / NaN, and the
# result of decimal_value for anything with a '.' or exponent. Raises a parse
# error for malformed input.
def parse_number
  negative = false
  if byte == MINUS
    negative = true
    advance(1)
  elsif byte == PLUS
    advance(1)
  end

  if byte == UPPER_I
    consume_keyword!("Infinity")
    return negative ? -Float::INFINITY : Float::INFINITY
  end
  if byte == UPPER_N
    consume_keyword!("NaN")
    return Float::NAN
  end

  int_start = @pos

  if byte == ZERO
    advance(1)
    if [LOWER_X, UPPER_X].include?(byte)
      advance(1)
      hex_start = @pos
      advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
      raise error("invalid hex number") if @pos == hex_start

      value = @input.byteslice(hex_start, @pos - hex_start).delete("_").to_i(16)
      return negative ? -value : value
    end
  elsif byte && byte >= 0x31 && byte <= NINE
    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
  elsif byte == DOT
    # leading decimal handled below
  else
    raise error("invalid number")
  end

  is_float = false

  if byte == DOT
    is_float = true
    advance(1)
    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
  end

  # The integer and fraction parts are each optional, but not both: a mantissa
  # with no digit at all (".", "+.", ".e5") is not a number. Without this
  # check such input would fall through to String#to_f and come back as 0.0.
  unless @input.byteslice(int_start, @pos - int_start).match?(/[0-9]/)
    raise error("invalid number")
  end

  if [LOWER_E, UPPER_E].include?(byte)
    is_float = true
    advance(1)
    advance(1) if [PLUS, MINUS].include?(byte)
    raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE

    advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
  end

  # The sign was consumed before int_start, so it is applied separately.
  slice = @input.byteslice(int_start, @pos - int_start).delete("_")
  value = is_float ? decimal_value(slice) : slice.to_i
  negative ? -value : value
end
893
+
894
# True for the bytes of 0-9, A-F and a-f.
def hex_digit?(b)
  b.between?(ZERO, NINE) || b.between?(0x41, 0x46) || b.between?(0x61, 0x66)
end
899
+
900
# Verifies that the bytes at the cursor spell `word` exactly, then advances
# past them. Raises a parse error (without moving) on the first mismatch.
def consume_keyword!(word)
  word.each_byte.with_index do |expected, offset|
    raise error("invalid literal #{word.inspect}") unless byte_at(offset) == expected
  end
  advance(word.bytesize)
end
906
+
907
# Consumes the keyword `word` at the cursor (raising if it is not there) and
# returns the Ruby `value` it stands for.
def parse_literal_keyword(word, value)
  consume_keyword!(word)
  value
end
911
+
912
# Builds (does not raise) a ParseError carrying the current line and column.
def error(message)
  ParseError.new(message, @line, @col)
end
915
+
916
# Human-readable form of a byte for error messages: "EOF" for nil, the quoted
# character for printable ASCII (0x20..0x7E), otherwise "0xNN".
def display_byte(b)
  return "EOF" if b.nil?

  printable = b >= 0x20 && b < 0x7F
  printable ? "'#{b.chr}'" : format("0x%02X", b)
end
925
+ end
926
+ end