smarter_json 0.9.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
4
+ # activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
5
+ using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
6
+
3
7
  module SmarterJSON
4
8
  # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
5
9
  # from the shared SmarterJSON::Error base.
@@ -12,15 +16,20 @@ module SmarterJSON
12
16
  # is always content, never a filename — use process_file for paths.) The values
13
17
  # in `options` override Parser::DEFAULT_OPTIONS.
14
18
  #
15
- # Without a block: returns nil (zero documents), the value (one document), or an
16
- # Array of the values (two or more NDJSON / JSONL / concatenated / whitespace-
17
- # separated). :acceleration (default true) selects the C extension when compiled
18
- # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
+ # Without a block: always returns an Array of the documents found — [] for none,
20
+ # [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
21
+ # top-level value must be a recognized JSON value (number / literal / quoted
22
+ # string / object / array) or an implicit-root object, else it raises. For the
23
+ # single-document case use SmarterJSON.process_one (returns the bare value).
24
+ # :acceleration (default true) selects the C extension when compiled and loaded
25
+ # (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
26
  #
20
- # With a block: yields each top-level document as it is parsed, and returns nil.
21
- # For an IO this streams document-by-document in bounded memory — it reads the
22
- # stream as newline-delimited documents (NDJSON / JSONL), one per line.
27
+ # With a block: yields each top-level document as it is parsed, and returns the
28
+ # document count. For an IO this streams document-by-document in bounded memory —
29
+ # it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
30
+ # line.
23
31
  def process(input, options = {}, &block)
32
+ options = Options.process_options(options)
24
33
  if input.is_a?(String)
25
34
  Recovery.process_string(input, options, &block)
26
35
  elsif input.respond_to?(:read)
@@ -39,7 +48,8 @@ module SmarterJSON
39
48
  # loading the whole file); the documents are read as newline-delimited
40
49
  # (NDJSON / JSONL), one per line.
41
50
  def process_file(path, options = {}, &block)
42
- encoding = options.fetch(:encoding, "UTF-8")
51
+ options = Options.process_options(options)
52
+ encoding = options[:encoding] || "UTF-8"
43
53
  if block
44
54
  File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
45
55
  else
@@ -47,8 +57,44 @@ module SmarterJSON
47
57
  end
48
58
  end
49
59
 
50
- # Parse a String of JSON content (the in-memory path). Returns nil (block) or
51
- # the value / Array (no block); the C extension is used when available.
60
+ # SmarterJSON.process_one(input, options = {}) — the single-document accessor.
61
+ #
62
+ # Returns the first document's value (or nil when the input holds no documents).
63
+ # When the input holds MORE than one document it returns the first and warns once
64
+ # — it never raises, since an extra document is valid data; the warning goes to
65
+ # on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
66
+ # For an IO this is bounded memory: it parses just the first document and stops as
67
+ # soon as a second is seen, instead of materialising the whole stream the way
68
+ # process(io).first would. (process(input).first and process(input)[0] silently
69
+ # drop documents 2+ — a footgun; use process_one instead.)
70
+ def process_one(input, options = {})
71
+ options = Options.process_options(options)
72
+
73
+ # IO: bounded memory — parse just the first document and stop once a second is
74
+ # seen (peek-to-warn). A String is already in memory, so use the plain no-block
75
+ # path: it returns the full (wrapper-recovered, de-duplicated) Array in one pass,
76
+ # which also avoids the reactive-recovery double-yield the block path would hit.
77
+ unless input.respond_to?(:read)
78
+ docs = process(input, options)
79
+ warn_extra_documents(options) if docs.length > 1
80
+ return docs.first
81
+ end
82
+
83
+ first = nil
84
+ count = 0
85
+ catch(:smarter_json_first_document) do
86
+ process(input, options) do |doc|
87
+ count += 1
88
+ first = doc if count == 1
89
+ throw(:smarter_json_first_document) if count > 1
90
+ end
91
+ end
92
+ warn_extra_documents(options) if count > 1
93
+ first
94
+ end
95
+
96
+ # Parse a String of JSON content (the in-memory path). Returns an Array of the
97
+ # documents found (empty for none); the C extension is used when available.
52
98
  def process_content(input, options, &block)
53
99
  if block
54
100
  if options.fetch(:acceleration, true) && HAS_ACCELERATION
@@ -63,14 +109,55 @@ module SmarterJSON
63
109
  end
64
110
  end
65
111
 
112
+ # Smart default for the nil :encoding option. A String tagged ASCII-8BIT (BINARY)
113
+ # is how Net::HTTP and many HTTP libraries hand back a response body even when the
114
+ # bytes are UTF-8. JSON's interchange encoding is UTF-8, so we relabel such input
115
+ # to UTF-8 when its bytes are valid UTF-8 — otherwise string values would come back
116
+ # tagged ASCII-8BIT and compare unequal to UTF-8 literals (a silent footgun). When
117
+ # the bytes are NOT valid UTF-8 we raise EncodingError rather than guess a legacy
118
+ # encoding — pass an explicit :encoding for that. An explicit (non-nil) :encoding,
119
+ # or any non-BINARY tag, is left untouched (the per-path force_encoding / validation
120
+ # handles it). Only relabels — never transcodes.
121
+ def normalize_default_encoding(input, options)
122
+ return input unless options[:encoding].nil?
123
+ return input unless input.encoding == Encoding::ASCII_8BIT
124
+
125
+ utf8 = input.dup.force_encoding(Encoding::UTF_8)
126
+ return utf8 if utf8.valid_encoding?
127
+
128
+ raise EncodingError, "input is tagged ASCII-8BIT and is not valid UTF-8 — pass encoding: to declare its encoding"
129
+ end
130
+
66
131
  # Stream documents from an IO incrementally, yielding each recovered top-level
67
132
  # document without slurping the whole input into memory first.
68
133
  def stream_io(io, options, &block)
69
- Framer.each_document(io) { |doc| Recovery.process_string(doc, options, &block) }
70
- nil
134
+ count = 0
135
+ Framer.each_document(io) do |doc|
136
+ # Recovery.process_string yields each value and returns how many it yielded;
137
+ # blank / comment-only framed segments yield none, so count tracks actual
138
+ # documents (== values yielded), not raw framed segments.
139
+ count += Recovery.process_string(doc, options, &block)
140
+ end
141
+ count
71
142
  end
72
143
 
73
- private_class_method :process_content, :stream_io
144
+ # process_one's "more than one document" notice — routed to on_warning if the caller
145
+ # gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
146
+ # never raised.
147
+ def warn_extra_documents(options)
148
+ message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
149
+ "dropping the rest. Use SmarterJSON.process to get every document."
150
+ handler = options[:on_warning]
151
+ if handler
152
+ handler.call(Warning.new(:extra_documents, message, nil, nil))
153
+ elsif defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
154
+ Rails.logger.warn(message)
155
+ else
156
+ Kernel.warn(message)
157
+ end
158
+ end
159
+
160
+ private_class_method :process_content, :stream_io, :warn_extra_documents
74
161
 
75
162
  # Named byte values, shared by the Parser FSM and the Framer / Recovery byte
76
163
  # scanners so none of them spell out raw hex. Included where needed.
@@ -119,7 +206,7 @@ module SmarterJSON
119
206
 
120
207
  module_function
121
208
 
122
- def each_document(io, &block)
209
+ def each_document(io)
123
210
  buffer = +""
124
211
  scan = 0
125
212
  doc_start = nil
@@ -343,6 +430,7 @@ module SmarterJSON
343
430
  module_function
344
431
 
345
432
  def process_string(input, options, &block)
433
+ input = SmarterJSON.send(:normalize_default_encoding, input, options)
346
434
  return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
347
435
 
348
436
  # Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
@@ -385,15 +473,23 @@ module SmarterJSON
385
473
  handler = options[:on_warning]
386
474
  emit_wrapper_warnings(payloads, handler)
387
475
 
388
- results = payloads.map do |payload|
389
- SmarterJSON.send(:process_content, payload[:slice], options)
476
+ if block_given?
477
+ count = 0
478
+ payloads.each do |payload|
479
+ SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
480
+ block.call(doc)
481
+ count += 1
482
+ end
483
+ end
484
+ return count
390
485
  end
391
486
 
392
- return results.each(&block).then { nil } if block_given?
393
- return nil if results.empty?
394
- return results.first if results.length == 1
395
-
396
- results
487
+ # Each payload's process_content now returns an Array of its documents; flatten
488
+ # so several recovered payloads yield one flat Array<doc> (the always-array
489
+ # contract), not an Array of Arrays.
490
+ payloads.flat_map do |payload|
491
+ SmarterJSON.send(:process_content, payload[:slice], options)
492
+ end
397
493
  end
398
494
 
399
495
  def emit_wrapper_warnings(payloads, handler)
@@ -613,18 +709,22 @@ module SmarterJSON
613
709
  # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
614
710
  # would change the string — so when it doesn't match, we skip normalization.
615
711
  NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
616
- BLANK_HEAD = /\A[[:space:]]+/.freeze
617
- BLANK_TAIL = /[[:space:]]+\z/.freeze
618
-
619
- # All caller-facing settings live in one options hash (smarter_csv style).
620
- DEFAULT_OPTIONS = {
621
- acceleration: true, # use the C extension when available
622
- encoding: nil, # label the input's encoding (no transcoding)
623
- symbolize_keys: false, # Symbol keys instead of String
624
- duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
625
- bigdecimal_load: :auto, # :auto | :float | :bigdecimal (Oj-compatible)
626
- on_warning: nil, # a callable invoked once per non-fatal lenient fix (a SmarterJSON::Warning)
627
- }.freeze
712
+
713
+ # parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
714
+ # MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
715
+ # portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
716
+ BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
717
+ DQUOTE_OR_BACKSLASH = /["\\]/.freeze
718
+ SQUOTE_OR_BACKSLASH = /['\\]/.freeze
719
+
720
+ # scan_quoteless_run's fast path jumps (in C) to the first structural terminator
721
+ # (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
722
+ # incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
723
+ # interior whitespace, so there's nothing to trim and no comment marker can apply.
724
+ QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
725
+
726
+ # The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
727
+ DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
628
728
 
629
729
  def initialize(input, options = {})
630
730
  raise ArgumentError, "input must be a String" unless input.is_a?(String)
@@ -632,8 +732,13 @@ module SmarterJSON
632
732
  opts = DEFAULT_OPTIONS.merge(options)
633
733
  @symbolize_keys = opts[:symbolize_keys]
634
734
  @duplicate_key = opts[:duplicate_key]
635
- @bigdecimal_load = opts[:bigdecimal_load]
636
- @on_warning = opts[:on_warning]
735
+ @decimal_precision = opts[:decimal_precision]
736
+ @on_warning = opts[:on_warning]
737
+ # store_member only needs the (per-member) Hash#key? duplicate lookup when a
738
+ # repeat would change behavior: a warning must fire, or :first_wins must keep the
739
+ # first. With the default (:last_wins, no handler) a duplicate just overwrites,
740
+ # which `hash[k] = value` already does — so skip the lookup entirely.
741
+ @check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
637
742
 
638
743
  encoding = opts[:encoding]
639
744
  @input = encoding ? input.dup.force_encoding(encoding) : input
@@ -642,8 +747,6 @@ module SmarterJSON
642
747
  @bytesize = @input.bytesize
643
748
  # Skip a UTF-8 BOM (EF BB BF) at the start of input.
644
749
  @pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
645
- @line = 1
646
- @col = 1
647
750
  end
648
751
 
649
752
  # No block: auto-detect the document count for free (the same "is there
@@ -653,17 +756,14 @@ module SmarterJSON
653
756
  # value. Commas separate documents too — skip_document_separators consumes
654
757
  # them along with whitespace and comments — so `1, 2, 3` parses as three documents.
655
758
  def parse
656
- skip_whitespace_and_comments
657
- return nil if eof?
658
-
659
- value = parse_document
660
- skip_whitespace_and_comments
661
- return value if eof?
662
-
663
- results = [value]
759
+ results = []
664
760
  until eof?
665
- results << parse_document
666
- skip_whitespace_and_comments
761
+ skip_document_separators
762
+ break if eof?
763
+
764
+ value = parse_document
765
+ enforce_scalar_boundary(value)
766
+ results << value
667
767
  end
668
768
  results
669
769
  end
@@ -671,13 +771,17 @@ module SmarterJSON
671
771
  # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
672
772
  # whitespace-separated). Used by the block form of SmarterJSON.process.
673
773
  def each_value
674
- loop do
675
- skip_whitespace_and_comments
774
+ count = 0
775
+ until eof?
776
+ skip_document_separators
676
777
  break if eof?
677
778
 
678
- yield parse_document
779
+ value = parse_document
780
+ enforce_scalar_boundary(value)
781
+ yield value
782
+ count += 1
679
783
  end
680
- nil
784
+ count
681
785
  end
682
786
 
683
787
  private
@@ -688,6 +792,48 @@ module SmarterJSON
688
792
  parse_iter(implicit_root_object_ahead?)
689
793
  end
690
794
 
795
+ # Between top-level documents, whitespace, comments, AND commas all separate
796
+ # (commas collapse like the in-container lenient-comma rule). A space alone never
797
+ # separates two bare scalars — enforce_scalar_boundary rejects that case, so
798
+ # `1 2 3` raises while `1, 2, 3` is three documents.
799
+ def skip_document_separators
800
+ skip_whitespace_and_comments
801
+ while byte == COMMA
802
+ advance(1)
803
+ skip_whitespace_and_comments
804
+ end
805
+ end
806
+
807
+ # After a top-level value: a self-delimiting value (object / array / quoted string)
808
+ # may be followed by anything (the next document self-delimits), but a bare scalar
809
+ # (number / keyword) must be followed by a real separator — a newline, ',', a
810
+ # comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
811
+ # rather than silently splitting; bare top-level words raise in parse_value itself.
812
+ def enforce_scalar_boundary(value)
813
+ return if value.is_a?(String) || value.is_a?(Hash) || value.is_a?(Array)
814
+
815
+ skip_horizontal_whitespace
816
+ b = byte
817
+ return if b.nil? || b == LF || b == CR || b == COMMA
818
+ return if b == HASH || (b == SLASH && ((c = byte_at(1)) == SLASH || c == STAR))
819
+
820
+ raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
821
+ end
822
+
823
+ # Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
824
+ # document separators. Used by the scalar-boundary check above.
825
+ def skip_horizontal_whitespace
826
+ while (b = byte)
827
+ if b == SPACE || b == TAB || b == 0x0B || b == 0x0C
828
+ advance(1)
829
+ elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
830
+ @pos += n # multibyte horizontal whitespace (NBSP, U+2000–200A, …)
831
+ else
832
+ break
833
+ end
834
+ end
835
+ end
836
+
691
837
  # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
692
838
  # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
693
839
  # never by the call stack. Mirrors the C driver to keep the two paths in
@@ -708,9 +854,10 @@ module SmarterJSON
708
854
  end
709
855
 
710
856
  vss = false # warnings: has a value landed in the current container since the last separator?
711
- loop do
857
+ input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
858
+ while true
712
859
  skip_whitespace_and_comments
713
- b = byte
860
+ b = input.getbyte(@pos)
714
861
  if at_top
715
862
  if b == LBRACE
716
863
  advance(1)
@@ -729,8 +876,17 @@ module SmarterJSON
729
876
  at_top = false
730
877
  vss = false
731
878
  elsif b.nil?
879
+ # Defensive guard: parse / each_value check eof? before calling parse_iter,
880
+ # so `at_top` never meets end-of-input here. Kept to mirror the C driver.
881
+ # :nocov:
732
882
  raise error("unexpected end of input")
883
+ # :nocov:
733
884
  else
885
+ # Top-level scalar: must be a recognized JSON value (number / literal /
886
+ # quoted string). A bare word raises — there are no top-level quoteless
887
+ # strings (Decision 2 = B-broad). In-container quoteless still uses
888
+ # parse_member_value; the scalar-vs-separator boundary is enforced by the
889
+ # parse / each_value loop via enforce_scalar_boundary.
734
890
  return parse_value
735
891
  end
736
892
  elsif b == COMMA
@@ -758,12 +914,12 @@ module SmarterJSON
758
914
  else
759
915
  key = parse_object_key
760
916
  skip_whitespace_and_comments
761
- raise error("expected ':' after key #{key.inspect}") unless byte == COLON
917
+ raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
762
918
 
763
919
  advance(1)
764
920
  skip_whitespace_and_comments
765
- b = byte
766
- if [LBRACE, LBRACKET].include?(b)
921
+ b = input.getbyte(@pos)
922
+ if b == LBRACE || b == LBRACKET
767
923
  child = b == LBRACE ? {} : []
768
924
  advance(1) # consume { or [
769
925
  store_member(cur, key, child)
@@ -771,7 +927,7 @@ module SmarterJSON
771
927
  cur = child
772
928
  cur_obj = (b == LBRACE)
773
929
  vss = false
774
- elsif [RBRACE, COMMA].include?(b)
930
+ elsif b == RBRACE || b == COMMA
775
931
  # key with a colon but no value -> null (don't consume } or ,; the loop does)
776
932
  store_member(cur, key, nil)
777
933
  warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
@@ -796,7 +952,7 @@ module SmarterJSON
796
952
  raise error("unterminated array")
797
953
  elsif b == RBRACE
798
954
  raise error("unexpected '}' — expected ']' or a value")
799
- elsif [LBRACE, LBRACKET].include?(b)
955
+ elsif b == LBRACE || b == LBRACKET
800
956
  child = b == LBRACE ? {} : []
801
957
  advance(1) # consume { or [
802
958
  cur.push(child)
@@ -818,11 +974,11 @@ module SmarterJSON
818
974
  b = byte
819
975
  return false unless b && key_start_byte?(b)
820
976
 
821
- saved = [@pos, @line, @col]
977
+ saved = @pos
822
978
  advance(1) while (c = byte) && key_continue_byte?(c)
823
979
  skip_pure_whitespace
824
980
  result = (byte == COLON)
825
- @pos, @line, @col = saved
981
+ @pos = saved
826
982
  result
827
983
  end
828
984
 
@@ -840,46 +996,72 @@ module SmarterJSON
840
996
  @pos >= @bytesize
841
997
  end
842
998
 
999
+ # Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
1000
+ # is computed lazily in line_col_at only when an error/warning is built. This is
1001
+ # the hot-path primitive every consumed byte goes through, so it stays O(1) with
1002
+ # no block, no re-read, and no per-byte branching. Mirrors the C fj_advance.
843
1003
  def advance(n = 1)
844
- n.times do
845
- b = @input.getbyte(@pos)
846
- return if b.nil?
1004
+ @pos += n
1005
+ @pos = @bytesize if @pos > @bytesize
1006
+ end
847
1007
 
1008
+ # Line and 1-based BYTE column at byte position `pos`, computed lazily by scanning
1009
+ # from the start of the buffer — only on the cold path (error / warning / triple-quote
1010
+ # indent), never per byte. CR, LF, and CRLF each count as one newline; the column is
1011
+ # the byte offset within the line. Mirrors the C extension's fj_line_col so both paths
1012
+ # report identical positions.
1013
+ def line_col_at(pos = @pos)
1014
+ limit = pos < @bytesize ? pos : @bytesize
1015
+ line = 1
1016
+ col = 1
1017
+ i = 0
1018
+ while i < limit
1019
+ b = @input.getbyte(i)
848
1020
  if b == LF
849
- @line += 1
850
- @col = 1
851
- @pos += 1
1021
+ line += 1
1022
+ col = 1
852
1023
  elsif b == CR
853
- @line += 1
854
- @col = 1
855
- @pos += 1
856
- @pos += 1 if @input.getbyte(@pos) == LF
1024
+ line += 1
1025
+ col = 1
1026
+ i += 1 if i + 1 < @bytesize && @input.getbyte(i + 1) == LF
857
1027
  else
858
- @col += 1
859
- @pos += 1
1028
+ col += 1
860
1029
  end
1030
+ i += 1
861
1031
  end
1032
+ [line, col]
1033
+ end
1034
+
1035
+ # 1-based byte column at `pos` (bytes since the last line start). Used for
1036
+ # triple-quoted-string indentation stripping. Mirrors the C fj_column.
1037
+ def column_at(pos = @pos)
1038
+ c = 1
1039
+ i = pos - 1
1040
+ while i >= 0 && (b = @input.getbyte(i)) != LF && b != CR
1041
+ c += 1
1042
+ i -= 1
1043
+ end
1044
+ c
862
1045
  end
863
1046
 
864
1047
  # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
865
1048
 
866
1049
  def skip_pure_whitespace
867
- loop do
868
- b = byte
869
- break if b.nil?
870
-
1050
+ input = @input
1051
+ pos = @pos
1052
+ while (b = input.getbyte(pos))
871
1053
  if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
872
- advance(1)
1054
+ pos += 1
873
1055
  elsif b >= 0x80
874
- n = multibyte_ws_len(@pos)
1056
+ n = multibyte_ws_len(pos)
875
1057
  break if n.zero?
876
1058
 
877
- @pos += n
878
- @col += 1
1059
+ pos += n
879
1060
  else
880
1061
  break
881
1062
  end
882
1063
  end
1064
+ @pos = pos
883
1065
  end
884
1066
 
885
1067
  # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
@@ -913,19 +1095,20 @@ module SmarterJSON
913
1095
  # A '#', '//', or '/*' starts a comment only when preceded by whitespace
914
1096
  # or at the very start of input (the comment-marker rule).
915
1097
  def skip_whitespace_and_comments
916
- loop do
1098
+ while true
917
1099
  skip_pure_whitespace
918
1100
  b = byte
919
- break if b.nil?
1101
+ if b == HASH
1102
+ break unless preceded_by_ws_or_start?
920
1103
 
921
- is_marker = (b == HASH) || (b == SLASH && [SLASH, STAR].include?(byte_at(1)))
922
- break unless is_marker
923
- break unless preceded_by_ws_or_start?
1104
+ skip_to_eol
1105
+ elsif b == SLASH
1106
+ c = byte_at(1)
1107
+ break unless (c == SLASH || c == STAR) && preceded_by_ws_or_start?
924
1108
 
925
- if b == SLASH && byte_at(1) == STAR
926
- skip_block_comment
1109
+ c == STAR ? skip_block_comment : skip_to_eol
927
1110
  else
928
- skip_to_eol
1111
+ break
929
1112
  end
930
1113
  end
931
1114
  end
@@ -965,8 +1148,9 @@ module SmarterJSON
965
1148
  # --- values ---
966
1149
 
967
1150
  # Top-level / strict value: no quoteless fallback.
1151
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1152
+ # so @pos is at the value's first byte — no leading skip needed here.
968
1153
  def parse_value
969
- skip_whitespace_and_comments
970
1154
  raise error("unexpected end of input") if eof?
971
1155
 
972
1156
  b = byte
@@ -999,8 +1183,9 @@ module SmarterJSON
999
1183
  end
1000
1184
 
1001
1185
  # Value in object-value or array-element position: quoteless allowed.
1186
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1187
+ # so @pos is at the value's first byte — no leading skip needed here.
1002
1188
  def parse_member_value
1003
- skip_whitespace_and_comments
1004
1189
  raise error("unexpected end of input") if eof?
1005
1190
 
1006
1191
  b = byte
@@ -1033,7 +1218,7 @@ module SmarterJSON
1033
1218
  until eof?
1034
1219
  if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
1035
1220
  closers.include?(@input.getbyte(@pos + 2))
1036
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1221
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1037
1222
  advance(3)
1038
1223
  return result
1039
1224
  end
@@ -1044,9 +1229,7 @@ module SmarterJSON
1044
1229
 
1045
1230
  def store_member(hash, key, value)
1046
1231
  k = @symbolize_keys ? key.to_sym : key
1047
- if hash.key?(k)
1048
- raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
1049
-
1232
+ if @check_duplicates && hash.key?(k)
1050
1233
  warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
1051
1234
  return if @duplicate_key == :first_wins
1052
1235
  end
@@ -1057,6 +1240,12 @@ module SmarterJSON
1057
1240
  b = byte
1058
1241
  return parse_string(DQUOTE) if b == DQUOTE
1059
1242
  return parse_string(SQUOTE) if b == SQUOTE
1243
+
1244
+ # A key may open with a smart/curly quote too (word-processor paste curls keys,
1245
+ # not just values) — route to the same reader values already use.
1246
+ kind = smart_quote_kind(@pos)
1247
+ return parse_smart_string(kind) if kind
1248
+
1060
1249
  raise error("expected a key") unless b && key_start_byte?(b)
1061
1250
 
1062
1251
  parse_identifier_key
@@ -1077,51 +1266,77 @@ module SmarterJSON
1077
1266
  start = @pos
1078
1267
  advance(1)
1079
1268
  advance(1) while (b = byte) && key_continue_byte?(b)
1080
- @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1269
+ @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1081
1270
  end
1082
1271
 
1083
1272
  # --- quoteless strings & literal classification ---
1084
1273
 
1085
1274
  def parse_quoteless_or_literal
1086
1275
  start = @pos
1087
- scan_quoteless_run
1276
+ value_end = scan_quoteless_run
1088
1277
  # A quoteless run must consume at least one byte. If the first byte is a
1089
1278
  # delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
1090
1279
  # here would make the caller's `result << parse_member_value` loop forever.
1091
1280
  # Raise instead (correct today: the Lenient Commas Option is not adopted).
1092
1281
  raise error("expected a value") if @pos == start
1093
1282
 
1094
- raw = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1095
- classify_quoteless(trim_blank(raw))
1283
+ # value_end is the end of the last non-whitespace char in the run; slicing to it
1284
+ # drops trailing whitespace without a regex (the caller already skipped leading
1285
+ # whitespace, so there is none to trim at the front). Equivalent to the old
1286
+ # trim_blank(raw) but with no per-scalar String#sub allocations.
1287
+ raw = @input.byteslice(start, value_end - start) # byteslice preserves @input's encoding
1288
+ classify_quoteless(raw)
1096
1289
  end
1097
1290
 
1098
1291
  # Advance to the end of a quoteless run. Stops at structural punctuation
1099
- # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
1100
- # whitespace. Spaces by themselves are not delimiters.
1292
+ # (',' '{' '}' '[' ']' — openers terminate symmetrically with closers, so a
1293
+ # self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
1294
+ # a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
1295
+ # themselves are not delimiters.
1296
+ # Advance @pos to the end of the quoteless run (including any trailing whitespace,
1297
+ # so the parser resumes correctly after the value). Returns value_end: the byte
1298
+ # offset just past the last NON-whitespace char, so the caller can slice off
1299
+ # trailing whitespace without a regex.
1101
1300
  def scan_quoteless_run
1301
+ input = @input
1302
+ pos = @pos
1303
+ # Fast path: one C-level byteindex jumps to the first structural terminator or
1304
+ # whitespace. If it lands on a terminator (or EOF) the run had no interior whitespace,
1305
+ # so [pos, hit) is the whole value — value_end == hit (no trailing trim) and no comment
1306
+ # marker can apply (those only break after whitespace). This is the common case
1307
+ # (numbers and simple tokens). Anything with whitespace falls to the byte-by-byte loop.
1308
+ if BYTEINDEX_AVAILABLE
1309
+ hit = input.byteindex(QL_BREAK, pos) || @bytesize
1310
+ b = hit < @bytesize ? input.getbyte(hit) : nil
1311
+ if b.nil? || b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1312
+ @pos = hit
1313
+ return hit
1314
+ end
1315
+ end
1316
+
1317
+ # Slow path: the run contains whitespace — scan byte by byte to honor interior
1318
+ # whitespace, trailing-whitespace trimming (value_end is the end of the last
1319
+ # non-whitespace char), and the comment-marker-after-whitespace rule.
1320
+ value_end = pos
1102
1321
  prev_ws = false
1103
- loop do
1104
- b = byte
1105
- break if b.nil?
1106
- break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
1107
- break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
1322
+ while (b = input.getbyte(pos))
1323
+ break if b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1324
+ break if prev_ws && (b == HASH || (b == SLASH && ((c = input.getbyte(pos + 1)) == SLASH || c == STAR)))
1108
1325
 
1109
1326
  if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
1110
1327
  prev_ws = true
1111
- advance(1)
1112
- elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
1328
+ pos += 1
1329
+ elsif b >= 0x80 && (n = multibyte_ws_len(pos)).positive?
1113
1330
  prev_ws = true
1114
- @pos += n
1115
- @col += 1
1331
+ pos += n
1116
1332
  else
1117
1333
  prev_ws = false
1118
- advance(1)
1334
+ pos += 1
1335
+ value_end = pos
1119
1336
  end
1120
1337
  end
1121
- end
1122
-
1123
- def trim_blank(str)
1124
- str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
1338
+ @pos = pos
1339
+ value_end
1125
1340
  end
1126
1341
 
1127
1342
  def classify_quoteless(str)
@@ -1132,7 +1347,7 @@ module SmarterJSON
1132
1347
  when "undefined" then return nil
1133
1348
  when "NaN" then return Float::NAN
1134
1349
  when "Infinity", "+Infinity" then return Float::INFINITY
1135
- when "-Infinity" then return (-Float::INFINITY)
1350
+ when "-Infinity" then return -Float::INFINITY
1136
1351
  end
1137
1352
  num = numeric_value(str)
1138
1353
  num.equal?(NOT_NUMERIC) ? str : num
@@ -1140,31 +1355,86 @@ module SmarterJSON
1140
1355
 
1141
1356
  # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
1142
1357
  def numeric_value(str)
1143
- if HEX_RE.match?(str)
1144
- neg = str.start_with?("-")
1358
+ # Cheap hex gate: only invoke HEX_RE when the token actually looks like [+-]?0x… .
1359
+ # A Regexp#match? has real per-call cost; almost no number is hex, so the 1–3 byte
1360
+ # check skips that call on the common path (measured +21% on long-token decimals).
1361
+ if hex_prefix?(str) && HEX_RE.match?(str)
1362
+ neg = str.getbyte(0) == MINUS
1145
1363
  body = str.sub(/\A[-+]/, "").delete("_") # "0x...."
1146
1364
  v = body[2..-1].to_i(16)
1147
1365
  return neg ? -v : v
1148
1366
  end
1149
1367
  return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)
1150
1368
 
1151
- body = str.delete("_")
1369
+ # delete("_") allocates a fresh string even when there is nothing to delete; on long
1370
+ # number tokens that is a real per-value allocation. Underscores are rare, so only
1371
+ # pay it when the token actually contains one (measured +27% on long-token decimals).
1372
+ body = str.include?("_") ? str.delete("_") : str
1152
1373
  body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
1153
1374
  end
1154
1375
 
1155
- # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
1376
+ # True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match.
1377
+ def hex_prefix?(str)
1378
+ c0 = str.getbyte(0)
1379
+ if c0 == ZERO
1380
+ x = str.getbyte(1)
1381
+ x == LOWER_X || x == UPPER_X
1382
+ elsif c0 == MINUS || c0 == PLUS
1383
+ str.getbyte(1) == ZERO && ((x = str.getbyte(2)) == LOWER_X || x == UPPER_X)
1384
+ else
1385
+ false
1386
+ end
1387
+ end
1388
+
1389
+ # A decimal (has '.' or exponent). decimal_precision: :float -> Float,
1156
1390
  # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
1157
1391
  # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
1158
1392
  def decimal_value(body)
1159
- case @bigdecimal_load
1160
- when :float then body.to_f
1393
+ case @decimal_precision
1394
+ when :float then float_or_warn(body)
1161
1395
  when :bigdecimal then to_big_decimal(body)
1162
- else significant_digits(body) > 16 ? to_big_decimal(body) : body.to_f
1396
+ else significant_digits(body) > 16 ? to_big_decimal(body) : float_or_warn(body)
1163
1397
  end
1164
1398
  end
1165
1399
 
1400
+ # A finite numeric literal whose magnitude exceeds Float range (e.g. 1e400) becomes
1401
+ # ±Infinity — a silent data change. Report it via :number_overflow (the value is still
1402
+ # returned; we warn rather than raise or invent). The Infinity/NaN *keywords* go through
1403
+ # a separate path and never reach here, so they don't warn.
1404
+ def float_or_warn(body)
1405
+ f = body.to_f
1406
+ # Only test for overflow when an on_warning handler is listening: `f.infinite?` is a
1407
+ # per-float method call we don't want on the hot number path otherwise, and with no
1408
+ # handler the warning would go nowhere anyway. Overflow is vanishingly rare.
1409
+ warn(:number_overflow, "number literal out of Float range — collapsed to #{f}") if @on_warning && f.infinite?
1410
+ f
1411
+ end
1412
+
1413
+ # Count significant mantissa digits (leading zeros excluded, exponent ignored) to pick
1414
+ # Float vs BigDecimal in :auto mode. A single byte-scan — the old three-regex version
1415
+ # (strip exponent, strip non-digits, strip leading zeros, .length) ran on every float
1416
+ # and dominated the number path's cost. body is a DEC_RE-validated token (digits, at most
1417
+ # one '.', optional sign, optional e/E exponent), underscores already removed.
1166
1418
  def significant_digits(body)
1167
- body.sub(/[eE].*\z/, "").gsub(/[^0-9]/, "").sub(/\A0+/, "").length
1419
+ count = 0
1420
+ leading = true
1421
+ i = 0
1422
+ n = body.bytesize
1423
+ while i < n
1424
+ b = body.getbyte(i)
1425
+ i += 1
1426
+ break if b == LOWER_E || b == UPPER_E # exponent: its digits aren't significant
1427
+
1428
+ next unless b >= ZERO && b <= NINE # skip sign and the decimal point
1429
+
1430
+ if leading && b == ZERO
1431
+ next # leading zero (incl. those after '.') — not significant
1432
+ else
1433
+ leading = false
1434
+ count += 1
1435
+ end
1436
+ end
1437
+ count
1168
1438
  end
1169
1439
 
1170
1440
  def to_big_decimal(body)
@@ -1175,7 +1445,11 @@ module SmarterJSON
1175
1445
  body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
1176
1446
  BigDecimal(body)
1177
1447
  rescue ArgumentError
1448
+ # Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
1449
+ # so this fallback is unreachable from valid input. Kept as a safety net.
1450
+ # :nocov:
1178
1451
  body.to_f
1452
+ # :nocov:
1179
1453
  end
1180
1454
 
1181
1455
  # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
@@ -1194,7 +1468,7 @@ module SmarterJSON
1194
1468
  end
1195
1469
 
1196
1470
  def parse_triple_quoted
1197
- indent = @col - 1
1471
+ indent = column_at(@pos) - 1
1198
1472
  advance(3)
1199
1473
  raw_start = @pos
1200
1474
  until eof?
@@ -1204,7 +1478,7 @@ module SmarterJSON
1204
1478
  end
1205
1479
  raise error("unterminated triple-quoted string") if eof?
1206
1480
 
1207
- raw = @input.byteslice(raw_start, @pos - raw_start).force_encoding(@input.encoding)
1481
+ raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
1208
1482
  advance(3)
1209
1483
  strip_triple(raw, indent)
1210
1484
  end
@@ -1234,20 +1508,30 @@ module SmarterJSON
1234
1508
  def parse_string(quote)
1235
1509
  advance(1)
1236
1510
  start = @pos
1237
- has_escape = false
1511
+ # Fast path (the common case — a string with no escapes): jump straight to the
1512
+ # closing quote with byteindex. It is called only here, from `start`, which is
1513
+ # always a character boundary, so byteindex never sees a mid-char offset.
1514
+ hit = scan_string_delimiter(quote)
1515
+ raise error("unterminated string") if hit.nil?
1516
+
1517
+ if @input.getbyte(hit) == quote
1518
+ @pos = hit
1519
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1520
+ advance(1)
1521
+ return result
1522
+ end
1523
+
1524
+ # Escape path: a backslash precedes the closing quote. Scan byte by byte from
1525
+ # here — byteindex can't be used past a backslash (a lenient \<multibyte> would
1526
+ # leave @pos mid-character), and this lets the decoder flag invalid escapes
1527
+ # exactly as before. decode_string_with_escapes handles the whole [start, finish].
1528
+ @pos = hit
1238
1529
  while (b = byte)
1239
1530
  if b == quote
1240
- if has_escape
1241
- decoded = decode_string_with_escapes(start, @pos, quote)
1242
- advance(1)
1243
- return decoded
1244
- else
1245
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1246
- advance(1)
1247
- return result
1248
- end
1531
+ decoded = decode_string_with_escapes(start, @pos, quote)
1532
+ advance(1)
1533
+ return decoded
1249
1534
  elsif b == BACKSLASH
1250
- has_escape = true
1251
1535
  advance(1)
1252
1536
  raise error("unterminated string escape") if eof?
1253
1537
 
@@ -1259,6 +1543,20 @@ module SmarterJSON
1259
1543
  raise error("unterminated string")
1260
1544
  end
1261
1545
 
1546
+ # Byte index of the next closing quote or backslash at/after @pos, or nil if
1547
+ # neither occurs before EOF. byteindex scans inside MRI's C; the fallback is a
1548
+ # tight getbyte loop (the ASCII delimiters never alias UTF-8 continuation bytes,
1549
+ # so byte scanning is correct for UTF-8 string content).
1550
+ def scan_string_delimiter(quote)
1551
+ if BYTEINDEX_AVAILABLE
1552
+ @input.byteindex(quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH, @pos)
1553
+ else
1554
+ i = @pos
1555
+ i += 1 while i < @bytesize && (b = @input.getbyte(i)) != quote && b != BACKSLASH
1556
+ i < @bytesize ? i : nil
1557
+ end
1558
+ end
1559
+
1262
1560
  def decode_string_with_escapes(start, finish, _quote)
1263
1561
  buf = String.new(encoding: Encoding::ASCII_8BIT)
1264
1562
  i = start
@@ -1350,7 +1648,7 @@ module SmarterJSON
1350
1648
 
1351
1649
  if byte == ZERO
1352
1650
  advance(1)
1353
- if [LOWER_X, UPPER_X].include?(byte)
1651
+ if (x = byte) == LOWER_X || x == UPPER_X
1354
1652
  advance(1)
1355
1653
  hex_start = @pos
1356
1654
  advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
@@ -1375,10 +1673,10 @@ module SmarterJSON
1375
1673
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
1376
1674
  end
1377
1675
 
1378
- if [LOWER_E, UPPER_E].include?(byte)
1676
+ if (e = byte) == LOWER_E || e == UPPER_E
1379
1677
  is_float = true
1380
1678
  advance(1)
1381
- advance(1) if [PLUS, MINUS].include?(byte)
1679
+ advance(1) if (s = byte) == PLUS || s == MINUS
1382
1680
  raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
1383
1681
 
1384
1682
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
@@ -1414,11 +1712,13 @@ module SmarterJSON
1414
1712
  def warn(type, message)
1415
1713
  return unless @on_warning
1416
1714
 
1417
- @on_warning.call(Warning.new(type, message, @line, @col))
1715
+ line, col = line_col_at(@pos)
1716
+ @on_warning.call(Warning.new(type, message, line, col))
1418
1717
  end
1419
1718
 
1420
1719
  def error(message)
1421
- ParseError.new(message, @line, @col)
1720
+ line, col = line_col_at(@pos)
1721
+ ParseError.new(message, line, col)
1422
1722
  end
1423
1723
 
1424
1724
  def display_byte(b)