smarter_json 0.8.0 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
4
+ # activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
5
+ using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
6
+
3
7
  module SmarterJSON
4
8
  # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
5
9
  # from the shared SmarterJSON::Error base.
@@ -12,15 +16,20 @@ module SmarterJSON
12
16
  # is always content, never a filename — use process_file for paths.) The values
13
17
  # in `options` override Parser::DEFAULT_OPTIONS.
14
18
  #
15
- # Without a block: returns nil (zero documents), the value (one document), or an
16
- # Array of the values (two or more NDJSON / JSONL / concatenated / whitespace-
17
- # separated). :acceleration (default true) selects the C extension when compiled
18
- # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
+ # Without a block: always returns an Array of the documents found — [] for none,
20
+ # [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
21
+ # top-level value must be a recognized JSON value (number / literal / quoted
22
+ # string / object / array) or an implicit-root object, else it raises. For the
23
+ # single-document case use SmarterJSON.process_one (returns the bare value).
24
+ # :acceleration (default true) selects the C extension when compiled and loaded
25
+ # (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
26
  #
20
- # With a block: yields each top-level document as it is parsed, and returns nil.
21
- # For an IO this streams document-by-document in bounded memory — it reads the
22
- # stream as newline-delimited documents (NDJSON / JSONL), one per line.
27
+ # With a block: yields each top-level document as it is parsed, and returns the
28
+ # document count. For an IO this streams document-by-document in bounded memory —
29
+ # it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
30
+ # line.
23
31
  def process(input, options = {}, &block)
32
+ options = Options.process_options(options)
24
33
  if input.is_a?(String)
25
34
  Recovery.process_string(input, options, &block)
26
35
  elsif input.respond_to?(:read)
@@ -39,7 +48,8 @@ module SmarterJSON
39
48
  # loading the whole file); the documents are read as newline-delimited
40
49
  # (NDJSON / JSONL), one per line.
41
50
  def process_file(path, options = {}, &block)
42
- encoding = options.fetch(:encoding, "UTF-8")
51
+ options = Options.process_options(options)
52
+ encoding = options[:encoding] || "UTF-8"
43
53
  if block
44
54
  File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
45
55
  else
@@ -47,8 +57,44 @@ module SmarterJSON
47
57
  end
48
58
  end
49
59
 
50
- # Parse a String of JSON content (the in-memory path). Returns nil (block) or
51
- # the value / Array (no block); the C extension is used when available.
60
+ # SmarterJSON.process_one(input, options = {}) — the single-document accessor.
61
+ #
62
+ # Returns the first document's value (or nil when the input holds no documents).
63
+ # When the input holds MORE than one document it returns the first and warns once
64
+ # — it never raises, since an extra document is valid data; the warning goes to
65
+ # on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
66
+ # For an IO this is bounded memory: it parses just the first document and stops as
67
+ # soon as a second is seen, instead of materialising the whole stream the way
68
+ # process(io).first would. (process(input).first and process(input)[0] silently
69
+ # drop documents 2+ — a footgun; use process_one instead.)
70
+ def process_one(input, options = {})
71
+ options = Options.process_options(options)
72
+
73
+ # IO: bounded memory — parse just the first document and stop once a second is
74
+ # seen (peek-to-warn). A String is already in memory, so use the plain no-block
75
+ # path: it returns the full (wrapper-recovered, de-duplicated) Array in one pass,
76
+ # which also avoids the reactive-recovery double-yield the block path would hit.
77
+ unless input.respond_to?(:read)
78
+ docs = process(input, options)
79
+ warn_extra_documents(options) if docs.length > 1
80
+ return docs.first
81
+ end
82
+
83
+ first = nil
84
+ count = 0
85
+ catch(:smarter_json_first_document) do
86
+ process(input, options) do |doc|
87
+ count += 1
88
+ first = doc if count == 1
89
+ throw(:smarter_json_first_document) if count > 1
90
+ end
91
+ end
92
+ warn_extra_documents(options) if count > 1
93
+ first
94
+ end
95
+
96
+ # Parse a String of JSON content (the in-memory path). Returns an Array of the
97
+ # documents found (empty for none); the C extension is used when available.
52
98
  def process_content(input, options, &block)
53
99
  if block
54
100
  if options.fetch(:acceleration, true) && HAS_ACCELERATION
@@ -63,22 +109,322 @@ module SmarterJSON
63
109
  end
64
110
  end
65
111
 
66
- # Stream documents from an IO, one line (= one document) at a time, yielding
67
- # each in bounded memory. Newline-delimited (NDJSON / JSONL); a single document
68
- # spanning multiple lines is not supported by the streaming path.
112
+ # Stream documents from an IO incrementally, yielding each recovered top-level
113
+ # document without slurping the whole input into memory first.
69
114
  def stream_io(io, options, &block)
70
- Recovery.process_string(io.read, options, &block)
115
+ count = 0
116
+ Framer.each_document(io) do |doc|
117
+ # Recovery.process_string yields each value and returns how many it yielded;
118
+ # blank / comment-only framed segments yield none, so count tracks actual
119
+ # documents (== values yielded), not raw framed segments.
120
+ count += Recovery.process_string(doc, options, &block)
121
+ end
122
+ count
71
123
  end
72
124
 
73
- private_class_method :process_content, :stream_io
125
+ # process_one's "more than one document" notice — routed to on_warning if the caller
126
+ # gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
127
+ # never raised.
128
+ def warn_extra_documents(options)
129
+ message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
130
+ "dropping the rest. Use SmarterJSON.process to get every document."
131
+ handler = options[:on_warning]
132
+ if handler
133
+ handler.call(Warning.new(:extra_documents, message, nil, nil))
134
+ elsif defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
135
+ Rails.logger.warn(message)
136
+ else
137
+ Kernel.warn(message)
138
+ end
139
+ end
140
+
141
+ private_class_method :process_content, :stream_io, :warn_extra_documents
142
+
143
+ # Named byte values, shared by the Parser FSM and the Framer / Recovery byte
144
+ # scanners so none of them spell out raw hex. Included where needed.
145
+ module Bytes
146
+ LBRACE = 0x7B
147
+ RBRACE = 0x7D
148
+ LBRACKET = 0x5B
149
+ RBRACKET = 0x5D
150
+ COLON = 0x3A
151
+ COMMA = 0x2C
152
+ DQUOTE = 0x22
153
+ SQUOTE = 0x27
154
+ BACKSLASH = 0x5C
155
+ SLASH = 0x2F
156
+ STAR = 0x2A
157
+ HASH = 0x23
158
+ MINUS = 0x2D
159
+ PLUS = 0x2B
160
+ DOT = 0x2E
161
+ ZERO = 0x30
162
+ NINE = 0x39
163
+ LOWER_E = 0x65
164
+ UPPER_E = 0x45
165
+ LOWER_T = 0x74
166
+ LOWER_F = 0x66
167
+ LOWER_N = 0x6E
168
+ LOWER_U = 0x75
169
+ LOWER_X = 0x78
170
+ UPPER_X = 0x58
171
+ UPPER_I = 0x49
172
+ UPPER_N = 0x4E
173
+ UPPER_T = 0x54
174
+ UPPER_F = 0x46
175
+ UNDERSCORE = 0x5F
176
+ DOLLAR = 0x24
177
+ SPACE = 0x20
178
+ TAB = 0x09
179
+ LF = 0x0A
180
+ CR = 0x0D
181
+ end
182
+
183
+ module Framer
184
+ include Bytes
185
+
186
+ CHUNK_SIZE = 16 * 1024
187
+
188
+ module_function
189
+
190
+ def each_document(io)
191
+ buffer = +""
192
+ scan = 0
193
+ doc_start = nil
194
+ stack = []
195
+ mode = nil
196
+
197
+ while (chunk = read_chunk(io))
198
+ buffer << chunk
199
+ loop do
200
+ emitted, buffer, scan, doc_start, stack, mode = scan_buffer(buffer, scan, doc_start, stack, mode)
201
+ break unless emitted
202
+
203
+ yield emitted
204
+ end
205
+ end
206
+
207
+ yield buffer unless separators_only?(buffer)
208
+ end
209
+
210
+ def read_chunk(io)
211
+ if io.respond_to?(:readpartial)
212
+ io.readpartial(CHUNK_SIZE)
213
+ else
214
+ io.read(CHUNK_SIZE)
215
+ end
216
+ rescue EOFError
217
+ nil
218
+ end
219
+
220
+ def scan_buffer(buffer, scan, doc_start, stack, mode)
221
+ while scan < buffer.bytesize
222
+ b = buffer.getbyte(scan)
223
+ # A multi-byte marker (// /* ''' */) whose lead byte is here but whose
224
+ # remaining bytes have not arrived yet must not be guessed at — advancing
225
+ # past the lead byte would misread the brace/quote that follows it once the
226
+ # next chunk lands. Stop and let each_document append more input, then resume
227
+ # from this same position. At true EOF the leftover is parsed whole instead.
228
+ break if defer_for_split_marker?(buffer, scan, b, mode, doc_start)
229
+
230
+ if mode == :double
231
+ if b == BACKSLASH
232
+ scan += 2
233
+ elsif b == DQUOTE
234
+ mode = nil
235
+ scan += 1
236
+ else
237
+ scan += 1
238
+ end
239
+ elsif mode == :single
240
+ if b == BACKSLASH
241
+ scan += 2
242
+ elsif b == SQUOTE
243
+ mode = nil
244
+ scan += 1
245
+ else
246
+ scan += 1
247
+ end
248
+ elsif mode == :triple
249
+ if buffer.byteslice(scan, 3) == "'''"
250
+ mode = nil
251
+ scan += 3
252
+ else
253
+ scan += 1
254
+ end
255
+ elsif mode == :line_comment
256
+ if [LF, CR].include?(b)
257
+ mode = nil
258
+ else
259
+ scan += 1
260
+ next
261
+ end
262
+ elsif mode == :block_comment
263
+ if buffer.byteslice(scan, 2) == '*/'
264
+ mode = nil
265
+ scan += 2
266
+ else
267
+ scan += 1
268
+ end
269
+ elsif doc_start.nil?
270
+ if whitespace_byte?(b)
271
+ scan += 1
272
+ elsif line_comment_start?(buffer, scan)
273
+ mode = :line_comment
274
+ scan += buffer.getbyte(scan) == HASH ? 1 : 2
275
+ elsif block_comment_start?(buffer, scan)
276
+ mode = :block_comment
277
+ scan += 2
278
+ elsif [LBRACE, LBRACKET].include?(b)
279
+ doc_start = scan
280
+ stack << b
281
+ scan += 1
282
+ else
283
+ scan = buffer.bytesize
284
+ end
285
+ else
286
+ if mode.nil? && line_comment_start?(buffer, scan)
287
+ mode = :line_comment
288
+ scan += buffer.getbyte(scan) == HASH ? 1 : 2
289
+ elsif mode.nil? && block_comment_start?(buffer, scan)
290
+ mode = :block_comment
291
+ scan += 2
292
+ elsif b == DQUOTE
293
+ mode = :double
294
+ scan += 1
295
+ elsif buffer.byteslice(scan, 3) == "'''"
296
+ mode = :triple
297
+ scan += 3
298
+ elsif b == SQUOTE
299
+ mode = :single
300
+ scan += 1
301
+ elsif [LBRACE, LBRACKET].include?(b)
302
+ stack << b
303
+ scan += 1
304
+ elsif b == RBRACE
305
+ stack.pop if stack.last == LBRACE
306
+ scan += 1
307
+ if stack.empty?
308
+ doc = buffer.byteslice(doc_start, scan - doc_start)
309
+ buffer = buffer.byteslice(scan..-1) || +""
310
+ return [doc, buffer, 0, nil, [], nil]
311
+ end
312
+ elsif b == RBRACKET
313
+ stack.pop if stack.last == LBRACKET
314
+ scan += 1
315
+ if stack.empty?
316
+ doc = buffer.byteslice(doc_start, scan - doc_start)
317
+ buffer = buffer.byteslice(scan..-1) || +""
318
+ return [doc, buffer, 0, nil, [], nil]
319
+ end
320
+ else
321
+ scan += 1
322
+ end
323
+ end
324
+ end
325
+
326
+ [nil, buffer, scan, doc_start, stack, mode]
327
+ end
328
+
329
+ # True when `b` is the lead byte of a multi-byte marker but the rest of that
330
+ # marker has not been read into the buffer yet, so we cannot decide what it is.
331
+ # `//` and `/*` need 2 bytes; `'''` (and a closing `'''`) needs 3; a closing
332
+ # `*/` needs 2. Backslash escapes and single-byte delimiters never need this.
333
+ def defer_for_split_marker?(buffer, scan, b, mode, doc_start)
334
+ avail = buffer.bytesize - scan
335
+ case mode
336
+ when :block_comment
337
+ b == STAR && avail < 2
338
+ when :triple
339
+ b == SQUOTE && avail < 3
340
+ when nil
341
+ if doc_start.nil?
342
+ b == SLASH && avail < 2
343
+ else
344
+ (b == SLASH && avail < 2) || (b == SQUOTE && avail < 3)
345
+ end
346
+ else
347
+ false
348
+ end
349
+ end
350
+
351
+ def separators_only?(buffer)
352
+ scan = 0
353
+ mode = nil
354
+ while scan < buffer.bytesize
355
+ b = buffer.getbyte(scan)
356
+ if mode == :line_comment
357
+ if [LF, CR].include?(b)
358
+ mode = nil
359
+ else
360
+ scan += 1
361
+ next
362
+ end
363
+ elsif mode == :block_comment
364
+ if buffer.byteslice(scan, 2) == '*/'
365
+ mode = nil
366
+ scan += 2
367
+ else
368
+ scan += 1
369
+ end
370
+ elsif whitespace_byte?(b)
371
+ scan += 1
372
+ elsif line_comment_start?(buffer, scan)
373
+ mode = :line_comment
374
+ scan += buffer.getbyte(scan) == HASH ? 1 : 2
375
+ elsif block_comment_start?(buffer, scan)
376
+ mode = :block_comment
377
+ scan += 2
378
+ else
379
+ return false
380
+ end
381
+ end
382
+ true
383
+ end
384
+
385
+ def whitespace_byte?(b)
386
+ b == SPACE || (b && b >= TAB && b <= CR)
387
+ end
388
+
389
+ def line_comment_start?(buffer, scan)
390
+ b = buffer.getbyte(scan)
391
+ return preceded_by_ws_or_start?(buffer, scan) if b == HASH
392
+
393
+ b == SLASH && buffer.getbyte(scan + 1) == SLASH && preceded_by_ws_or_start?(buffer, scan)
394
+ end
395
+
396
+ def block_comment_start?(buffer, scan)
397
+ buffer.getbyte(scan) == SLASH && buffer.getbyte(scan + 1) == STAR && preceded_by_ws_or_start?(buffer, scan)
398
+ end
399
+
400
+ def preceded_by_ws_or_start?(buffer, scan)
401
+ return true if scan.zero?
402
+
403
+ prev = buffer.getbyte(scan - 1)
404
+ whitespace_byte?(prev)
405
+ end
406
+ end
74
407
 
75
408
  module Recovery
409
+ include Bytes
410
+
76
411
  module_function
77
412
 
78
413
  def process_string(input, options, &block)
79
414
  return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
80
415
 
81
- if wrapper_hint?(input)
416
+ # Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
417
+ # the parse actually fails (the rescue below). Every wrapper shape — code fences,
418
+ # <json>/BEGIN_JSON tags, prose around the payload — makes the parse raise, so the
419
+ # rescue catches it. Crucially this keeps clean input on the single-parse fast path
420
+ # even when its string values legitimately contain ``` or <json> (real-world data
421
+ # like GitHub event payloads is full of markdown), instead of dragging hundreds of
422
+ # MB through the pure-Ruby candidate scan.
423
+ #
424
+ # The one exception is a bare leading label like "JSON: {...}", which parses
425
+ # successfully but WRONGLY (as an implicit-root object keyed by the label), so it
426
+ # must be intercepted before parsing.
427
+ if leading_label?(input)
82
428
  payloads = extract_payloads(input, options)
83
429
  return replay_payloads(payloads, options, &block) unless payloads.empty?
84
430
  end
@@ -93,25 +439,37 @@ module SmarterJSON
93
439
  raise
94
440
  end
95
441
 
96
- def wrapper_hint?(input)
97
- return false unless input.valid_encoding?
98
-
99
- input.match?(/```|<json\b|BEGIN_JSON\b/i) || input.match?(/\A[[:space:]]*(?:JSON|Final answer)[[:space:]]*:/i)
442
+ # Whether the input opens with a bare "JSON:" / "Final answer:" label (which would
443
+ # otherwise parse, wrongly, as an implicit-root object keyed by the label). We use
444
+ # String#start_with? with a Regexp rather than match?(/\A.../): start_with? checks
445
+ # only the beginning, whereas a \A-anchored match? still retries at every byte
446
+ # position and so scans the WHOLE input (≈0.3s on a 200 MB document) on every parse.
447
+ # (Caller has already established the input is valid_encoding?.)
448
+ def leading_label?(input)
449
+ input.start_with?(/[[:space:]]*(?:JSON|Final answer)[[:space:]]*:/i)
100
450
  end
101
451
 
102
452
  def replay_payloads(payloads, options, &block)
103
453
  handler = options[:on_warning]
104
454
  emit_wrapper_warnings(payloads, handler)
105
455
 
106
- results = payloads.map do |payload|
107
- SmarterJSON.send(:process_content, payload[:slice], options)
456
+ if block_given?
457
+ count = 0
458
+ payloads.each do |payload|
459
+ SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
460
+ block.call(doc)
461
+ count += 1
462
+ end
463
+ end
464
+ return count
108
465
  end
109
466
 
110
- return results.each(&block).then { nil } if block_given?
111
- return nil if results.empty?
112
- return results.first if results.length == 1
113
-
114
- results
467
+ # Each payload's process_content now returns an Array of its documents; flatten
468
+ # so several recovered payloads yield one flat Array<doc> (the always-array
469
+ # contract), not an Array of Arrays.
470
+ payloads.flat_map do |payload|
471
+ SmarterJSON.send(:process_content, payload[:slice], options)
472
+ end
115
473
  end
116
474
 
117
475
  def emit_wrapper_warnings(payloads, handler)
@@ -146,16 +504,31 @@ module SmarterJSON
146
504
  last = ranges.last
147
505
  prefix = input.byteslice(0, first.begin)
148
506
  suffix = input.byteslice(last.end, input.bytesize - last.end)
507
+ # Look for fence / wrapper markers only in the text we actually strip (outside
508
+ # every recovered payload), so a ``` or <json> sitting inside a payload's own
509
+ # string value does not trigger a "stripped a wrapper" warning.
510
+ outside = non_payload_text(input, ranges)
149
511
  {
150
512
  prefix: substantive_text?(prefix),
151
513
  suffix: substantive_text?(suffix),
152
- fence: input.match?(/```/),
153
- wrapper: input.match?(/<json\b|BEGIN_JSON\b/i),
514
+ fence: outside.include?("```"),
515
+ wrapper: outside.match?(/<json\b|BEGIN_JSON\b/i),
154
516
  first_pos: line_col_for(input, first.begin),
155
517
  last_pos: line_col_for(input, last.begin)
156
518
  }
157
519
  end
158
520
 
521
+ def non_payload_text(input, ranges)
522
+ out = +""
523
+ pos = 0
524
+ ranges.each do |range|
525
+ out << input.byteslice(pos, range.begin - pos) if range.begin > pos
526
+ pos = range.end
527
+ end
528
+ out << input.byteslice(pos, input.bytesize - pos) if pos < input.bytesize
529
+ out
530
+ end
531
+
159
532
  def line_col_for(input, offset)
160
533
  line = 1
161
534
  col = 1
@@ -164,15 +537,15 @@ module SmarterJSON
164
537
  b = input.getbyte(i)
165
538
  break if b.nil?
166
539
 
167
- if b == 0x0A
540
+ if b == LF
168
541
  line += 1
169
542
  col = 1
170
543
  i += 1
171
- elsif b == 0x0D
544
+ elsif b == CR
172
545
  line += 1
173
546
  col = 1
174
547
  i += 1
175
- i += 1 if i < offset && input.getbyte(i) == 0x0A
548
+ i += 1 if i < offset && input.getbyte(i) == LF
176
549
  else
177
550
  col += 1
178
551
  i += 1
@@ -203,19 +576,19 @@ module SmarterJSON
203
576
  while i < input.bytesize
204
577
  b = input.getbyte(i)
205
578
  if mode == :double
206
- if b == 0x5C
579
+ if b == BACKSLASH
207
580
  i += 2
208
581
  next
209
- elsif b == 0x22
582
+ elsif b == DQUOTE
210
583
  mode = nil
211
584
  end
212
585
  i += 1
213
586
  next
214
587
  elsif mode == :single
215
- if b == 0x5C
588
+ if b == BACKSLASH
216
589
  i += 2
217
590
  next
218
- elsif b == 0x27
591
+ elsif b == SQUOTE
219
592
  mode = nil
220
593
  end
221
594
  i += 1
@@ -229,7 +602,7 @@ module SmarterJSON
229
602
  end
230
603
  next
231
604
  elsif mode == :line_comment
232
- if [0x0A, 0x0D].include?(b)
605
+ if [LF, CR].include?(b)
233
606
  mode = nil
234
607
  else
235
608
  i += 1
@@ -252,11 +625,11 @@ module SmarterJSON
252
625
  mode = :block_comment
253
626
  i += 2
254
627
  next
255
- elsif b == 0x23
628
+ elsif b == HASH
256
629
  mode = :line_comment
257
630
  i += 1
258
631
  next
259
- elsif b == 0x22
632
+ elsif b == DQUOTE
260
633
  mode = :double
261
634
  i += 1
262
635
  next
@@ -264,21 +637,21 @@ module SmarterJSON
264
637
  mode = :triple
265
638
  i += 3
266
639
  next
267
- elsif b == 0x27
640
+ elsif b == SQUOTE
268
641
  mode = :single
269
642
  i += 1
270
643
  next
271
- elsif [0x7B, 0x5B].include?(b)
644
+ elsif [LBRACE, LBRACKET].include?(b)
272
645
  start_pos = i if stack.empty?
273
646
  stack << b
274
- elsif b == 0x7D
275
- stack.pop if stack.last == 0x7B
647
+ elsif b == RBRACE
648
+ stack.pop if stack.last == LBRACE
276
649
  if stack.empty? && start_pos
277
650
  ranges << (start_pos...(i + 1))
278
651
  start_pos = nil
279
652
  end
280
- elsif b == 0x5D
281
- stack.pop if stack.last == 0x5B
653
+ elsif b == RBRACKET
654
+ stack.pop if stack.last == LBRACKET
282
655
  if stack.empty? && start_pos
283
656
  ranges << (start_pos...(i + 1))
284
657
  start_pos = nil
@@ -304,41 +677,7 @@ module SmarterJSON
304
677
  # Python literals (True/False/None) and undefined, underscores in
305
678
  # numeric literals, and encoding validation (SmarterJSON::EncodingError).
306
679
  class Parser
307
- LBRACE = 0x7B
308
- RBRACE = 0x7D
309
- LBRACKET = 0x5B
310
- RBRACKET = 0x5D
311
- COLON = 0x3A
312
- COMMA = 0x2C
313
- DQUOTE = 0x22
314
- SQUOTE = 0x27
315
- BACKSLASH = 0x5C
316
- SLASH = 0x2F
317
- STAR = 0x2A
318
- HASH = 0x23
319
- MINUS = 0x2D
320
- PLUS = 0x2B
321
- DOT = 0x2E
322
- ZERO = 0x30
323
- NINE = 0x39
324
- LOWER_E = 0x65
325
- UPPER_E = 0x45
326
- LOWER_T = 0x74
327
- LOWER_F = 0x66
328
- LOWER_N = 0x6E
329
- LOWER_U = 0x75
330
- LOWER_X = 0x78
331
- UPPER_X = 0x58
332
- UPPER_I = 0x49
333
- UPPER_N = 0x4E
334
- UPPER_T = 0x54
335
- UPPER_F = 0x46
336
- UNDERSCORE = 0x5F
337
- DOLLAR = 0x24
338
- SPACE = 0x20
339
- TAB = 0x09
340
- LF = 0x0A
341
- CR = 0x0D
680
+ include Bytes
342
681
 
343
682
  NOT_NUMERIC = Object.new
344
683
  HEX_RE = /\A[-+]?0[xX][0-9a-fA-F_]+\z/.freeze
@@ -350,18 +689,22 @@ module SmarterJSON
350
689
  # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
351
690
  # would change the string — so when it doesn't match, we skip normalization.
352
691
  NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
353
- BLANK_HEAD = /\A[[:space:]]+/.freeze
354
- BLANK_TAIL = /[[:space:]]+\z/.freeze
355
-
356
- # All caller-facing settings live in one options hash (smarter_csv style).
357
- DEFAULT_OPTIONS = {
358
- acceleration: true, # use the C extension when available
359
- encoding: nil, # label the input's encoding (no transcoding)
360
- symbolize_keys: false, # Symbol keys instead of String
361
- duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
362
- bigdecimal_load: :auto, # :auto | :float | :bigdecimal (Oj-compatible)
363
- on_warning: nil, # a callable invoked once per non-fatal lenient fix (a SmarterJSON::Warning)
364
- }.freeze
692
+
693
+ # parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
694
+ # MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
695
+ # portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
696
+ BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
697
+ DQUOTE_OR_BACKSLASH = /["\\]/.freeze
698
+ SQUOTE_OR_BACKSLASH = /['\\]/.freeze
699
+
700
+ # scan_quoteless_run's fast path jumps (in C) to the first structural terminator
701
+ # (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
702
+ # incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
703
+ # interior whitespace, so there's nothing to trim and no comment marker can apply.
704
+ QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
705
+
706
+ # The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
707
+ DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
365
708
 
366
709
  def initialize(input, options = {})
367
710
  raise ArgumentError, "input must be a String" unless input.is_a?(String)
@@ -369,8 +712,13 @@ module SmarterJSON
369
712
  opts = DEFAULT_OPTIONS.merge(options)
370
713
  @symbolize_keys = opts[:symbolize_keys]
371
714
  @duplicate_key = opts[:duplicate_key]
372
- @bigdecimal_load = opts[:bigdecimal_load]
373
- @on_warning = opts[:on_warning]
715
+ @decimal_precision = opts[:decimal_precision]
716
+ @on_warning = opts[:on_warning]
717
+ # store_member only needs the (per-member) Hash#key? duplicate lookup when a
718
+ # repeat would change behavior: a warning must fire, or :first_wins must keep the
719
+ # first. With the default (:last_wins, no handler) a duplicate just overwrites,
720
+ # which `hash[k] = value` already does — so skip the lookup entirely.
721
+ @check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
374
722
 
375
723
  encoding = opts[:encoding]
376
724
  @input = encoding ? input.dup.force_encoding(encoding) : input
@@ -379,8 +727,6 @@ module SmarterJSON
379
727
  @bytesize = @input.bytesize
380
728
  # Skip a UTF-8 BOM (EF BB BF) at the start of input.
381
729
  @pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
382
- @line = 1
383
- @col = 1
384
730
  end
385
731
 
386
732
  # No block: auto-detect the document count for free (the same "is there
@@ -390,17 +736,14 @@ module SmarterJSON
390
736
  # value. Commas do NOT separate documents (only whitespace / newline /
391
737
  # concatenation do), so a bracketless comma list still raises in parse_document.
392
738
  def parse
393
- skip_whitespace_and_comments
394
- return nil if eof?
395
-
396
- value = parse_document
397
- skip_whitespace_and_comments
398
- return value if eof?
399
-
400
- results = [value]
739
+ results = []
401
740
  until eof?
402
- results << parse_document
403
- skip_whitespace_and_comments
741
+ skip_document_separators
742
+ break if eof?
743
+
744
+ value = parse_document
745
+ enforce_scalar_boundary(value)
746
+ results << value
404
747
  end
405
748
  results
406
749
  end
@@ -408,13 +751,17 @@ module SmarterJSON
408
751
  # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
409
752
  # whitespace-separated). Used by the block form of SmarterJSON.process.
410
753
  def each_value
411
- loop do
412
- skip_whitespace_and_comments
754
+ count = 0
755
+ until eof?
756
+ skip_document_separators
413
757
  break if eof?
414
758
 
415
- yield parse_document
759
+ value = parse_document
760
+ enforce_scalar_boundary(value)
761
+ yield value
762
+ count += 1
416
763
  end
417
- nil
764
+ count
418
765
  end
419
766
 
420
767
  private
@@ -425,6 +772,48 @@ module SmarterJSON
425
772
  parse_iter(implicit_root_object_ahead?)
426
773
  end
427
774
 
775
+ # Between top-level documents, whitespace, comments, AND commas all separate
776
+ # (commas collapse like the in-container lenient-comma rule). After a bare scalar a
777
+ # space alone is not a separator — enforce_scalar_boundary raises on it, so
778
+ # `1 2 3` is an error while `1, 2, 3` is three documents.
779
+ def skip_document_separators
780
+ skip_whitespace_and_comments
781
+ while byte == COMMA
782
+ advance(1)
783
+ skip_whitespace_and_comments
784
+ end
785
+ end
786
+
787
+ # After a top-level value: a self-delimiting value (object / array / quoted string)
788
+ # may be followed by anything (the next document self-delimits), but a bare scalar
789
+ # (number / keyword) must be followed by a real separator — a newline, ',', a
790
+ # comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
791
+ # rather than silently splitting; bare top-level words raise in parse_value itself.
792
+ def enforce_scalar_boundary(value)
793
+ return if value.is_a?(String) || value.is_a?(Hash) || value.is_a?(Array)
794
+
795
+ skip_horizontal_whitespace
796
+ b = byte
797
+ return if b.nil? || b == LF || b == CR || b == COMMA
798
+ return if b == HASH || (b == SLASH && ((c = byte_at(1)) == SLASH || c == STAR))
799
+
800
+ raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
801
+ end
802
+
803
+ # Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
804
+ # document separators. Used by the scalar-boundary check above.
805
+ def skip_horizontal_whitespace
806
+ while (b = byte)
807
+ if b == SPACE || b == TAB || b == 0x0B || b == 0x0C
808
+ advance(1)
809
+ elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
810
+ @pos += n # multibyte horizontal whitespace (NBSP, U+2000–200A, …)
811
+ else
812
+ break
813
+ end
814
+ end
815
+ end
816
+
428
817
  # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
429
818
  # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
430
819
  # never by the call stack. Mirrors the C driver to keep the two paths in
@@ -445,9 +834,10 @@ module SmarterJSON
445
834
  end
446
835
 
447
836
  vss = false # warnings: has a value landed in the current container since the last separator?
448
- loop do
837
+ input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
838
+ while true
449
839
  skip_whitespace_and_comments
450
- b = byte
840
+ b = input.getbyte(@pos)
451
841
  if at_top
452
842
  if b == LBRACE
453
843
  advance(1)
@@ -466,8 +856,17 @@ module SmarterJSON
466
856
  at_top = false
467
857
  vss = false
468
858
  elsif b.nil?
859
+ # Defensive guard: parse / each_value check eof? before calling parse_iter,
860
+ # so `at_top` never meets end-of-input here. Kept to mirror the C driver.
861
+ # :nocov:
469
862
  raise error("unexpected end of input")
863
+ # :nocov:
470
864
  else
865
+ # Top-level scalar: must be a recognized JSON value (number / literal /
866
+ # quoted string). A bare word raises — there are no top-level quoteless
867
+ # strings (Decision 2 = B-broad). In-container quoteless still uses
868
+ # parse_member_value; the scalar-vs-separator boundary is enforced by the
869
+ # parse / each_value loop via enforce_scalar_boundary.
471
870
  return parse_value
472
871
  end
473
872
  elsif b == COMMA
@@ -495,12 +894,12 @@ module SmarterJSON
495
894
  else
496
895
  key = parse_object_key
497
896
  skip_whitespace_and_comments
498
- raise error("expected ':' after key #{key.inspect}") unless byte == COLON
897
+ raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
499
898
 
500
899
  advance(1)
501
900
  skip_whitespace_and_comments
502
- b = byte
503
- if [LBRACE, LBRACKET].include?(b)
901
+ b = input.getbyte(@pos)
902
+ if b == LBRACE || b == LBRACKET
504
903
  child = b == LBRACE ? {} : []
505
904
  advance(1) # consume { or [
506
905
  store_member(cur, key, child)
@@ -508,7 +907,7 @@ module SmarterJSON
508
907
  cur = child
509
908
  cur_obj = (b == LBRACE)
510
909
  vss = false
511
- elsif [RBRACE, COMMA].include?(b)
910
+ elsif b == RBRACE || b == COMMA
512
911
  # key with a colon but no value -> null (don't consume } or ,; the loop does)
513
912
  store_member(cur, key, nil)
514
913
  warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
@@ -533,7 +932,7 @@ module SmarterJSON
533
932
  raise error("unterminated array")
534
933
  elsif b == RBRACE
535
934
  raise error("unexpected '}' — expected ']' or a value")
536
- elsif [LBRACE, LBRACKET].include?(b)
935
+ elsif b == LBRACE || b == LBRACKET
537
936
  child = b == LBRACE ? {} : []
538
937
  advance(1) # consume { or [
539
938
  cur.push(child)
@@ -555,11 +954,11 @@ module SmarterJSON
555
954
  b = byte
556
955
  return false unless b && key_start_byte?(b)
557
956
 
558
- saved = [@pos, @line, @col]
957
+ saved = @pos
559
958
  advance(1) while (c = byte) && key_continue_byte?(c)
560
959
  skip_pure_whitespace
561
960
  result = (byte == COLON)
562
- @pos, @line, @col = saved
961
+ @pos = saved
563
962
  result
564
963
  end
565
964
 
@@ -577,46 +976,72 @@ module SmarterJSON
577
976
  @pos >= @bytesize
578
977
  end
579
978
 
979
# Move the byte cursor forward by +n+ bytes, never past the end of the buffer.
#
# Only @pos changes here: no line/column state is maintained. Positions are
# derived on demand by line_col_at when an error or warning is built, which keeps
# this primitive constant-time (no block, no re-read of the input, no per-byte
# branching). The original note says it mirrors the C extension's fj_advance.
#
# @param n [Integer] number of bytes to consume (defaults to 1)
def advance(n = 1)
  target = @pos + n
  @pos = target
  # Clamp: a request that runs past EOF parks the cursor exactly at EOF.
  @pos = @bytesize if target > @bytesize
end
584
987
 
988
# Compute the line number and 1-based BYTE column for byte offset +pos+.
#
# The answer is produced by walking the buffer from offset 0 up to +pos+ (clamped
# to the buffer size), so it is meant for cold paths only: building an error or a
# warning. LF, a lone CR, and a CRLF pair each count as exactly one line break; the
# column is the number of bytes since the start of the line, plus one. The original
# note says this mirrors the C extension's fj_line_col so both paths agree.
#
# @param pos [Integer] byte offset to describe (defaults to the current cursor)
# @return [Array(Integer, Integer)] [line, column], both starting at 1
def line_col_at(pos = @pos)
  stop = pos < @bytesize ? pos : @bytesize
  line = 1
  col = 1
  idx = 0
  while idx < stop
    case @input.getbyte(idx)
    when LF
      line += 1
      col = 1
    when CR
      line += 1
      col = 1
      # Swallow the LF of a CRLF pair so the pair counts as one newline.
      idx += 1 if idx + 1 < @bytesize && @input.getbyte(idx + 1) == LF
    else
      col += 1
    end
    idx += 1
  end
  [line, col]
end
1014
+
1015
# 1-based byte column of offset +pos+: one plus the number of bytes between +pos+
# and the nearest preceding LF or CR (or the start of the buffer).
#
# Scans backwards from +pos+, so the cost is bounded by the length of the current
# line. Used when stripping indentation from triple-quoted strings. The original
# note says it mirrors the C extension's fj_column.
#
# @param pos [Integer] byte offset to describe (defaults to the current cursor)
# @return [Integer] column, starting at 1
def column_at(pos = @pos)
  scan = pos - 1
  while scan >= 0
    prev = @input.getbyte(scan)
    break if prev == LF || prev == CR

    scan -= 1
  end
  # `scan` now rests on the line break (or at -1), so the distance is the column.
  pos - scan
end
600
1026
 
601
1027
  # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
602
1028
 
603
1029
  def skip_pure_whitespace
604
- loop do
605
- b = byte
606
- break if b.nil?
607
-
1030
+ input = @input
1031
+ pos = @pos
1032
+ while (b = input.getbyte(pos))
608
1033
  if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
609
- advance(1)
1034
+ pos += 1
610
1035
  elsif b >= 0x80
611
- n = multibyte_ws_len(@pos)
1036
+ n = multibyte_ws_len(pos)
612
1037
  break if n.zero?
613
1038
 
614
- @pos += n
615
- @col += 1
1039
+ pos += n
616
1040
  else
617
1041
  break
618
1042
  end
619
1043
  end
1044
+ @pos = pos
620
1045
  end
621
1046
 
622
1047
  # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
@@ -650,19 +1075,20 @@ module SmarterJSON
650
1075
  # A '#', '//', or '/*' starts a comment only when preceded by whitespace
651
1076
  # or at the very start of input (the comment-marker rule).
652
1077
  def skip_whitespace_and_comments
653
- loop do
1078
+ while true
654
1079
  skip_pure_whitespace
655
1080
  b = byte
656
- break if b.nil?
1081
+ if b == HASH
1082
+ break unless preceded_by_ws_or_start?
657
1083
 
658
- is_marker = (b == HASH) || (b == SLASH && [SLASH, STAR].include?(byte_at(1)))
659
- break unless is_marker
660
- break unless preceded_by_ws_or_start?
1084
+ skip_to_eol
1085
+ elsif b == SLASH
1086
+ c = byte_at(1)
1087
+ break unless (c == SLASH || c == STAR) && preceded_by_ws_or_start?
661
1088
 
662
- if b == SLASH && byte_at(1) == STAR
663
- skip_block_comment
1089
+ c == STAR ? skip_block_comment : skip_to_eol
664
1090
  else
665
- skip_to_eol
1091
+ break
666
1092
  end
667
1093
  end
668
1094
  end
@@ -702,8 +1128,9 @@ module SmarterJSON
702
1128
  # --- values ---
703
1129
 
704
1130
  # Top-level / strict value: no quoteless fallback.
1131
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1132
+ # so @pos is at the value's first byte — no leading skip needed here.
705
1133
  def parse_value
706
- skip_whitespace_and_comments
707
1134
  raise error("unexpected end of input") if eof?
708
1135
 
709
1136
  b = byte
@@ -736,8 +1163,9 @@ module SmarterJSON
736
1163
  end
737
1164
 
738
1165
  # Value in object-value or array-element position: quoteless allowed.
1166
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1167
+ # so @pos is at the value's first byte — no leading skip needed here.
739
1168
  def parse_member_value
740
- skip_whitespace_and_comments
741
1169
  raise error("unexpected end of input") if eof?
742
1170
 
743
1171
  b = byte
@@ -770,7 +1198,7 @@ module SmarterJSON
770
1198
  until eof?
771
1199
  if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
772
1200
  closers.include?(@input.getbyte(@pos + 2))
773
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1201
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
774
1202
  advance(3)
775
1203
  return result
776
1204
  end
@@ -781,9 +1209,7 @@ module SmarterJSON
781
1209
 
782
1210
  def store_member(hash, key, value)
783
1211
  k = @symbolize_keys ? key.to_sym : key
784
- if hash.key?(k)
785
- raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
786
-
1212
+ if @check_duplicates && hash.key?(k)
787
1213
  warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
788
1214
  return if @duplicate_key == :first_wins
789
1215
  end
@@ -814,51 +1240,77 @@ module SmarterJSON
814
1240
  start = @pos
815
1241
  advance(1)
816
1242
  advance(1) while (b = byte) && key_continue_byte?(b)
817
- @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1243
+ @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
818
1244
  end
819
1245
 
820
1246
  # --- quoteless strings & literal classification ---
821
1247
 
822
1248
  def parse_quoteless_or_literal
823
1249
  start = @pos
824
- scan_quoteless_run
1250
+ value_end = scan_quoteless_run
825
1251
  # A quoteless run must consume at least one byte. If the first byte is a
826
1252
  # delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
827
1253
  # here would make the caller's `result << parse_member_value` loop forever.
828
1254
  # Raise instead (correct today: the Lenient Commas Option is not adopted).
829
1255
  raise error("expected a value") if @pos == start
830
1256
 
831
- raw = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
832
- classify_quoteless(trim_blank(raw))
1257
+ # value_end is the end of the last non-whitespace char in the run; slicing to it
1258
+ # drops trailing whitespace without a regex (the caller already skipped leading
1259
+ # whitespace, so there is none to trim at the front). Equivalent to the old
1260
+ # trim_blank(raw) but with no per-scalar String#sub allocations.
1261
+ raw = @input.byteslice(start, value_end - start) # byteslice preserves @input's encoding
1262
+ classify_quoteless(raw)
833
1263
  end
834
1264
 
835
1265
  # Advance to the end of a quoteless run. Stops at structural punctuation
836
- # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
837
- # whitespace. Spaces by themselves are not delimiters.
1266
+ # (',' '{' '}' '[' ']' openers terminate symmetrically with closers, so a
1267
+ # self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
1268
+ # a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
1269
+ # themselves are not delimiters.
1270
+ # Advance @pos to the end of the quoteless run (including any trailing whitespace,
1271
+ # so the parser resumes correctly after the value). Returns value_end: the byte
1272
+ # offset just past the last NON-whitespace char, so the caller can slice off
1273
+ # trailing whitespace without a regex.
838
1274
  def scan_quoteless_run
1275
+ input = @input
1276
+ pos = @pos
1277
+ # Fast path: one C-level byteindex jumps to the first structural terminator or
1278
+ # whitespace. If it lands on a terminator (or EOF) the run had no interior whitespace,
1279
+ # so [pos, hit) is the whole value — value_end == hit (no trailing trim) and no comment
1280
+ # marker can apply (those only break after whitespace). This is the common case
1281
+ # (numbers and simple tokens). Anything with whitespace falls to the byte-by-byte loop.
1282
+ if BYTEINDEX_AVAILABLE
1283
+ hit = input.byteindex(QL_BREAK, pos) || @bytesize
1284
+ b = hit < @bytesize ? input.getbyte(hit) : nil
1285
+ if b.nil? || b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1286
+ @pos = hit
1287
+ return hit
1288
+ end
1289
+ end
1290
+
1291
+ # Slow path: the run contains whitespace — scan byte by byte to honor interior
1292
+ # whitespace, trailing-whitespace trimming (value_end is the end of the last
1293
+ # non-whitespace char), and the comment-marker-after-whitespace rule.
1294
+ value_end = pos
839
1295
  prev_ws = false
840
- loop do
841
- b = byte
842
- break if b.nil?
843
- break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
844
- break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
1296
+ while (b = input.getbyte(pos))
1297
+ break if b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1298
+ break if prev_ws && (b == HASH || (b == SLASH && ((c = input.getbyte(pos + 1)) == SLASH || c == STAR)))
845
1299
 
846
1300
  if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
847
1301
  prev_ws = true
848
- advance(1)
849
- elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
1302
+ pos += 1
1303
+ elsif b >= 0x80 && (n = multibyte_ws_len(pos)).positive?
850
1304
  prev_ws = true
851
- @pos += n
852
- @col += 1
1305
+ pos += n
853
1306
  else
854
1307
  prev_ws = false
855
- advance(1)
1308
+ pos += 1
1309
+ value_end = pos
856
1310
  end
857
1311
  end
858
- end
859
-
860
- def trim_blank(str)
861
- str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
1312
+ @pos = pos
1313
+ value_end
862
1314
  end
863
1315
 
864
1316
  def classify_quoteless(str)
@@ -869,7 +1321,7 @@ module SmarterJSON
869
1321
  when "undefined" then return nil
870
1322
  when "NaN" then return Float::NAN
871
1323
  when "Infinity", "+Infinity" then return Float::INFINITY
872
- when "-Infinity" then return (-Float::INFINITY)
1324
+ when "-Infinity" then return -Float::INFINITY
873
1325
  end
874
1326
  num = numeric_value(str)
875
1327
  num.equal?(NOT_NUMERIC) ? str : num
@@ -877,31 +1329,73 @@ module SmarterJSON
877
1329
 
878
1330
  # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
879
1331
  def numeric_value(str)
880
- if HEX_RE.match?(str)
881
- neg = str.start_with?("-")
1332
+ # Cheap hex gate: only invoke HEX_RE when the token actually looks like [+-]?0x… .
1333
+ # A Regexp#match? has real per-call cost; almost no number is hex, so the 1–3 byte
1334
+ # check skips that call on the common path (measured +21% on long-token decimals).
1335
+ if hex_prefix?(str) && HEX_RE.match?(str)
1336
+ neg = str.getbyte(0) == MINUS
882
1337
  body = str.sub(/\A[-+]/, "").delete("_") # "0x...."
883
1338
  v = body[2..-1].to_i(16)
884
1339
  return neg ? -v : v
885
1340
  end
886
1341
  return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)
887
1342
 
888
- body = str.delete("_")
1343
+ # delete("_") allocates a fresh string even when there is nothing to delete; on long
1344
+ # number tokens that is a real per-value allocation. Underscores are rare, so only
1345
+ # pay it when the token actually contains one (measured +27% on long-token decimals).
1346
+ body = str.include?("_") ? str.delete("_") : str
889
1347
  body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
890
1348
  end
891
1349
 
892
- # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
1350
# Does +str+ begin with [+-]?0[xX]? That is the only shape HEX_RE can match, so
# this byte check serves as a cheap pre-filter before running the regexp.
#
# @param str [String] candidate numeric token
# @return [Boolean]
def hex_prefix?(str)
  lead = str.getbyte(0)
  # An optional sign shifts the expected "0x" one byte to the right.
  zero_at = lead == MINUS || lead == PLUS ? 1 : 0
  return false unless str.getbyte(zero_at) == ZERO

  marker = str.getbyte(zero_at + 1)
  marker == LOWER_X || marker == UPPER_X
end
1362
+
1363
# Convert a decimal token (one containing '.' or an exponent) according to the
# @decimal_precision setting:
#   :float      -> Float
#   :bigdecimal -> BigDecimal (via to_big_decimal)
#   otherwise   -> BigDecimal when the mantissa has more than 16 significant
#                  digits (Oj's DEC_MAX threshold), else Float. This is the
#                  :auto behaviour.
def decimal_value(body)
  mode = @decimal_precision
  return body.to_f if mode == :float
  # In auto mode the digit count decides; it is not computed for :bigdecimal.
  return to_big_decimal(body) if mode == :bigdecimal || significant_digits(body) > 16

  body.to_f
end
902
1373
 
1374
# Number of significant digits in the mantissa of +body+, used to choose between
# Float and BigDecimal in :auto mode.
#
# Leading zeros (including zeros after the decimal point that precede the first
# non-zero digit) are not counted, and everything from the exponent marker onward
# is ignored. Done as one pass over the bytes, with no regexps or intermediate
# strings. +body+ is expected to be a DEC_RE-validated token with underscores
# already removed.
#
# @param body [String] decimal token
# @return [Integer]
def significant_digits(body)
  total = 0
  seen_nonzero = false
  idx = 0
  len = body.bytesize
  while idx < len
    ch = body.getbyte(idx)
    idx += 1
    break if ch == LOWER_E || ch == UPPER_E # exponent: its digits aren't significant

    next if ch < ZERO || ch > NINE # sign or decimal point
    next if !seen_nonzero && ch == ZERO # leading zero, not significant

    seen_nonzero = true
    total += 1
  end
  total
end
906
1400
 
907
1401
  def to_big_decimal(body)
@@ -912,7 +1406,11 @@ module SmarterJSON
912
1406
  body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
913
1407
  BigDecimal(body)
914
1408
  rescue ArgumentError
1409
+ # Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
1410
+ # so this fallback is unreachable from valid input. Kept as a safety net.
1411
+ # :nocov:
915
1412
  body.to_f
1413
+ # :nocov:
916
1414
  end
917
1415
 
918
1416
  # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
@@ -931,7 +1429,7 @@ module SmarterJSON
931
1429
  end
932
1430
 
933
1431
  def parse_triple_quoted
934
- indent = @col - 1
1432
+ indent = column_at(@pos) - 1
935
1433
  advance(3)
936
1434
  raw_start = @pos
937
1435
  until eof?
@@ -941,7 +1439,7 @@ module SmarterJSON
941
1439
  end
942
1440
  raise error("unterminated triple-quoted string") if eof?
943
1441
 
944
- raw = @input.byteslice(raw_start, @pos - raw_start).force_encoding(@input.encoding)
1442
+ raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
945
1443
  advance(3)
946
1444
  strip_triple(raw, indent)
947
1445
  end
@@ -971,20 +1469,30 @@ module SmarterJSON
971
1469
  def parse_string(quote)
972
1470
  advance(1)
973
1471
  start = @pos
974
- has_escape = false
1472
+ # Fast path (the common case — a string with no escapes): jump straight to the
1473
+ # closing quote with byteindex. It is called only here, from `start`, which is
1474
+ # always a character boundary, so byteindex never sees a mid-char offset.
1475
+ hit = scan_string_delimiter(quote)
1476
+ raise error("unterminated string") if hit.nil?
1477
+
1478
+ if @input.getbyte(hit) == quote
1479
+ @pos = hit
1480
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1481
+ advance(1)
1482
+ return result
1483
+ end
1484
+
1485
+ # Escape path: a backslash precedes the closing quote. Scan byte by byte from
1486
+ # here — byteindex can't be used past a backslash (a lenient \<multibyte> would
1487
+ # leave @pos mid-character), and this lets the decoder flag invalid escapes
1488
+ # exactly as before. decode_string_with_escapes handles the whole [start, finish].
1489
+ @pos = hit
975
1490
  while (b = byte)
976
1491
  if b == quote
977
- if has_escape
978
- decoded = decode_string_with_escapes(start, @pos, quote)
979
- advance(1)
980
- return decoded
981
- else
982
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
983
- advance(1)
984
- return result
985
- end
1492
+ decoded = decode_string_with_escapes(start, @pos, quote)
1493
+ advance(1)
1494
+ return decoded
986
1495
  elsif b == BACKSLASH
987
- has_escape = true
988
1496
  advance(1)
989
1497
  raise error("unterminated string escape") if eof?
990
1498
 
@@ -996,6 +1504,20 @@ module SmarterJSON
996
1504
  raise error("unterminated string")
997
1505
  end
998
1506
 
1507
# Locate the next byte, at or after @pos, that is either the closing +quote+ or a
# backslash. Returns its byte index, or nil when neither occurs before EOF.
#
# When byteindex is available the search runs inside the String implementation
# using a prebuilt pattern (the double-quote pattern for DQUOTE, the single-quote
# pattern otherwise). Without it, a plain getbyte loop is used; comparing raw
# bytes is safe for UTF-8 content because these ASCII delimiters never appear as
# continuation bytes.
#
# @param quote [Integer] byte value of the string's quote character
# @return [Integer, nil]
def scan_string_delimiter(quote)
  if BYTEINDEX_AVAILABLE
    pattern = quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH
    return @input.byteindex(pattern, @pos)
  end

  probe = @pos
  while probe < @bytesize
    seen = @input.getbyte(probe)
    break if seen == quote || seen == BACKSLASH

    probe += 1
  end
  probe < @bytesize ? probe : nil
end
1520
+
999
1521
  def decode_string_with_escapes(start, finish, _quote)
1000
1522
  buf = String.new(encoding: Encoding::ASCII_8BIT)
1001
1523
  i = start
@@ -1087,7 +1609,7 @@ module SmarterJSON
1087
1609
 
1088
1610
  if byte == ZERO
1089
1611
  advance(1)
1090
- if [LOWER_X, UPPER_X].include?(byte)
1612
+ if (x = byte) == LOWER_X || x == UPPER_X
1091
1613
  advance(1)
1092
1614
  hex_start = @pos
1093
1615
  advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
@@ -1112,10 +1634,10 @@ module SmarterJSON
1112
1634
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
1113
1635
  end
1114
1636
 
1115
- if [LOWER_E, UPPER_E].include?(byte)
1637
+ if (e = byte) == LOWER_E || e == UPPER_E
1116
1638
  is_float = true
1117
1639
  advance(1)
1118
- advance(1) if [PLUS, MINUS].include?(byte)
1640
+ advance(1) if (s = byte) == PLUS || s == MINUS
1119
1641
  raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
1120
1642
 
1121
1643
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
@@ -1151,11 +1673,13 @@ module SmarterJSON
1151
1673
  def warn(type, message)
1152
1674
  return unless @on_warning
1153
1675
 
1154
- @on_warning.call(Warning.new(type, message, @line, @col))
1676
+ line, col = line_col_at(@pos)
1677
+ @on_warning.call(Warning.new(type, message, line, col))
1155
1678
  end
1156
1679
 
1157
1680
  def error(message)
1158
- ParseError.new(message, @line, @col)
1681
+ line, col = line_col_at(@pos)
1682
+ ParseError.new(message, line, col)
1159
1683
  end
1160
1684
 
1161
1685
  def display_byte(b)