smarter_json 0.9.2 → 0.9.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ # Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
4
+ # activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
5
+ using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
6
+
3
7
  module SmarterJSON
4
8
  # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
5
9
  # from the shared SmarterJSON::Error base.
@@ -12,15 +16,20 @@ module SmarterJSON
12
16
  # is always content, never a filename — use process_file for paths.) The values
13
17
  # in `options` override Parser::DEFAULT_OPTIONS.
14
18
  #
15
- # Without a block: returns nil (zero documents), the value (one document), or an
16
- # Array of the values (two or more NDJSON / JSONL / concatenated / whitespace-
17
- # separated). :acceleration (default true) selects the C extension when compiled
18
- # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
+ # Without a block: always returns an Array of the documents found — [] for none,
20
+ # [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
21
+ # top-level value must be a recognized JSON value (number / literal / quoted
22
+ # string / object / array) or an implicit-root object, else it raises. For the
23
+ # single-document case use SmarterJSON.process_one (returns the bare value).
24
+ # :acceleration (default true) selects the C extension when compiled and loaded
25
+ # (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
19
26
  #
20
- # With a block: yields each top-level document as it is parsed, and returns nil.
21
- # For an IO this streams document-by-document in bounded memory — it reads the
22
- # stream as newline-delimited documents (NDJSON / JSONL), one per line.
27
+ # With a block: yields each top-level document as it is parsed, and returns the
28
+ # document count. For an IO this streams document-by-document in bounded memory —
29
+ # it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
30
+ # line.
23
31
  def process(input, options = {}, &block)
32
+ options = Options.process_options(options)
24
33
  if input.is_a?(String)
25
34
  Recovery.process_string(input, options, &block)
26
35
  elsif input.respond_to?(:read)
@@ -39,7 +48,8 @@ module SmarterJSON
39
48
  # loading the whole file); the documents are read as newline-delimited
40
49
  # (NDJSON / JSONL), one per line.
41
50
  def process_file(path, options = {}, &block)
42
- encoding = options.fetch(:encoding, "UTF-8")
51
+ options = Options.process_options(options)
52
+ encoding = options[:encoding] || "UTF-8"
43
53
  if block
44
54
  File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
45
55
  else
@@ -47,8 +57,44 @@ module SmarterJSON
47
57
  end
48
58
  end
49
59
 
50
- # Parse a String of JSON content (the in-memory path). Returns nil (block) or
51
- # the value / Array (no block); the C extension is used when available.
60
+ # SmarterJSON.process_one(input, options = {}) — the single-document accessor.
61
+ #
62
+ # Returns the first document's value (or nil when the input holds no documents).
63
+ # When the input holds MORE than one document it returns the first and warns once
64
+ # — it never raises, since an extra document is valid data; the warning goes to
65
+ # on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
66
+ # For an IO this is bounded memory: it parses just the first document and stops as
67
+ # soon as a second is seen, instead of materialising the whole stream the way
68
+ # process(io).first would. (process(input).first and process(input)[0] silently
69
+ # drop documents 2+ — a footgun; use process_one instead.)
70
+ def process_one(input, options = {})
71
+ options = Options.process_options(options)
72
+
73
+ # IO: bounded memory — parse just the first document and stop once a second is
74
+ # seen (peek-to-warn). A String is already in memory, so use the plain no-block
75
+ # path: it returns the full (wrapper-recovered, de-duplicated) Array in one pass,
76
+ # which also avoids the reactive-recovery double-yield the block path would hit.
77
+ unless input.respond_to?(:read)
78
+ docs = process(input, options)
79
+ warn_extra_documents(options) if docs.length > 1
80
+ return docs.first
81
+ end
82
+
83
+ first = nil
84
+ count = 0
85
+ catch(:smarter_json_first_document) do
86
+ process(input, options) do |doc|
87
+ count += 1
88
+ first = doc if count == 1
89
+ throw(:smarter_json_first_document) if count > 1
90
+ end
91
+ end
92
+ warn_extra_documents(options) if count > 1
93
+ first
94
+ end
95
+
96
+ # Parse a String of JSON content (the in-memory path). Returns an Array of the
97
+ # documents found (empty for none); the C extension is used when available.
52
98
  def process_content(input, options, &block)
53
99
  if block
54
100
  if options.fetch(:acceleration, true) && HAS_ACCELERATION
@@ -66,11 +112,33 @@ module SmarterJSON
66
112
  # Stream documents from an IO incrementally, yielding each recovered top-level
67
113
  # document without slurping the whole input into memory first.
68
114
  def stream_io(io, options, &block)
69
- Framer.each_document(io) { |doc| Recovery.process_string(doc, options, &block) }
70
- nil
115
+ count = 0
116
+ Framer.each_document(io) do |doc|
117
+ # Recovery.process_string yields each value and returns how many it yielded;
118
+ # blank / comment-only framed segments yield none, so count tracks actual
119
+ # documents (== values yielded), not raw framed segments.
120
+ count += Recovery.process_string(doc, options, &block)
121
+ end
122
+ count
123
+ end
124
+
125
+ # process_one's "more than one document" notice — routed to on_warning if the caller
126
+ # gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
127
+ # never raised.
128
+ def warn_extra_documents(options)
129
+ message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
130
+ "dropping the rest. Use SmarterJSON.process to get every document."
131
+ handler = options[:on_warning]
132
+ if handler
133
+ handler.call(Warning.new(:extra_documents, message, nil, nil))
134
+ elsif defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
135
+ Rails.logger.warn(message)
136
+ else
137
+ Kernel.warn(message)
138
+ end
71
139
  end
72
140
 
73
- private_class_method :process_content, :stream_io
141
+ private_class_method :process_content, :stream_io, :warn_extra_documents
74
142
 
75
143
  # Named byte values, shared by the Parser FSM and the Framer / Recovery byte
76
144
  # scanners so none of them spell out raw hex. Included where needed.
@@ -119,7 +187,7 @@ module SmarterJSON
119
187
 
120
188
  module_function
121
189
 
122
- def each_document(io, &block)
190
+ def each_document(io)
123
191
  buffer = +""
124
192
  scan = 0
125
193
  doc_start = nil
@@ -385,15 +453,23 @@ module SmarterJSON
385
453
  handler = options[:on_warning]
386
454
  emit_wrapper_warnings(payloads, handler)
387
455
 
388
- results = payloads.map do |payload|
389
- SmarterJSON.send(:process_content, payload[:slice], options)
456
+ if block_given?
457
+ count = 0
458
+ payloads.each do |payload|
459
+ SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
460
+ block.call(doc)
461
+ count += 1
462
+ end
463
+ end
464
+ return count
390
465
  end
391
466
 
392
- return results.each(&block).then { nil } if block_given?
393
- return nil if results.empty?
394
- return results.first if results.length == 1
395
-
396
- results
467
+ # Each payload's process_content now returns an Array of its documents; flatten
468
+ # so several recovered payloads yield one flat Array<doc> (the always-array
469
+ # contract), not an Array of Arrays.
470
+ payloads.flat_map do |payload|
471
+ SmarterJSON.send(:process_content, payload[:slice], options)
472
+ end
397
473
  end
398
474
 
399
475
  def emit_wrapper_warnings(payloads, handler)
@@ -613,18 +689,22 @@ module SmarterJSON
613
689
  # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
614
690
  # would change the string — so when it doesn't match, we skip normalization.
615
691
  NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
616
- BLANK_HEAD = /\A[[:space:]]+/.freeze
617
- BLANK_TAIL = /[[:space:]]+\z/.freeze
618
-
619
- # All caller-facing settings live in one options hash (smarter_csv style).
620
- DEFAULT_OPTIONS = {
621
- acceleration: true, # use the C extension when available
622
- encoding: nil, # label the input's encoding (no transcoding)
623
- symbolize_keys: false, # Symbol keys instead of String
624
- duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
625
- bigdecimal_load: :auto, # :auto | :float | :bigdecimal (Oj-compatible)
626
- on_warning: nil, # a callable invoked once per non-fatal lenient fix (a SmarterJSON::Warning)
627
- }.freeze
692
+
693
+ # parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
694
+ # MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
695
+ # portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
696
+ BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
697
+ DQUOTE_OR_BACKSLASH = /["\\]/.freeze
698
+ SQUOTE_OR_BACKSLASH = /['\\]/.freeze
699
+
700
+ # scan_quoteless_run's fast path jumps (in C) to the first structural terminator
701
+ # (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
702
+ # incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
703
+ # interior whitespace, so there's nothing to trim and no comment marker can apply.
704
+ QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
705
+
706
+ # The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
707
+ DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
628
708
 
629
709
  def initialize(input, options = {})
630
710
  raise ArgumentError, "input must be a String" unless input.is_a?(String)
@@ -632,8 +712,13 @@ module SmarterJSON
632
712
  opts = DEFAULT_OPTIONS.merge(options)
633
713
  @symbolize_keys = opts[:symbolize_keys]
634
714
  @duplicate_key = opts[:duplicate_key]
635
- @bigdecimal_load = opts[:bigdecimal_load]
636
- @on_warning = opts[:on_warning]
715
+ @decimal_precision = opts[:decimal_precision]
716
+ @on_warning = opts[:on_warning]
717
+ # store_member only needs the (per-member) Hash#key? duplicate lookup when a
718
+ # repeat would change behavior: a warning must fire, or :first_wins must keep the
719
+ # first. With the default (:last_wins, no handler) a duplicate just overwrites,
720
+ # which `hash[k] = value` already does — so skip the lookup entirely.
721
+ @check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
637
722
 
638
723
  encoding = opts[:encoding]
639
724
  @input = encoding ? input.dup.force_encoding(encoding) : input
@@ -642,8 +727,6 @@ module SmarterJSON
642
727
  @bytesize = @input.bytesize
643
728
  # Skip a UTF-8 BOM (EF BB BF) at the start of input.
644
729
  @pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
645
- @line = 1
646
- @col = 1
647
730
  end
648
731
 
649
732
  # No block: auto-detect the document count for free (the same "is there
@@ -653,17 +736,14 @@ module SmarterJSON
653
736
  # value. Commas do NOT separate documents (only whitespace / newline /
654
737
  # concatenation do), so a bracketless comma list still raises in parse_document.
655
738
  def parse
656
- skip_whitespace_and_comments
657
- return nil if eof?
658
-
659
- value = parse_document
660
- skip_whitespace_and_comments
661
- return value if eof?
662
-
663
- results = [value]
739
+ results = []
664
740
  until eof?
665
- results << parse_document
666
- skip_whitespace_and_comments
741
+ skip_document_separators
742
+ break if eof?
743
+
744
+ value = parse_document
745
+ enforce_scalar_boundary(value)
746
+ results << value
667
747
  end
668
748
  results
669
749
  end
@@ -671,13 +751,17 @@ module SmarterJSON
671
751
  # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
672
752
  # whitespace-separated). Used by the block form of SmarterJSON.process.
673
753
  def each_value
674
- loop do
675
- skip_whitespace_and_comments
754
+ count = 0
755
+ until eof?
756
+ skip_document_separators
676
757
  break if eof?
677
758
 
678
- yield parse_document
759
+ value = parse_document
760
+ enforce_scalar_boundary(value)
761
+ yield value
762
+ count += 1
679
763
  end
680
- nil
764
+ count
681
765
  end
682
766
 
683
767
  private
@@ -688,6 +772,48 @@ module SmarterJSON
688
772
  parse_iter(implicit_root_object_ahead?)
689
773
  end
690
774
 
775
+ # Between top-level documents, whitespace, comments, AND commas all separate
776
+ # (commas collapse like the in-container lenient-comma rule). A space alone never
777
+ # separates — a bare top-level scalar followed by only spaces and more content is
778
+ # rejected by enforce_scalar_boundary, so `1 2 3` raises while `1, 2, 3` is three.
779
+ def skip_document_separators
780
+ skip_whitespace_and_comments
781
+ while byte == COMMA
782
+ advance(1)
783
+ skip_whitespace_and_comments
784
+ end
785
+ end
786
+
787
+ # After a top-level value: a self-delimiting value (object / array / quoted string)
788
+ # may be followed by anything (the next document self-delimits), but a bare scalar
789
+ # (number / keyword) must be followed by a real separator — a newline, ',', a
790
+ # comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
791
+ # rather than silently splitting; bare top-level words raise in parse_value itself.
792
+ def enforce_scalar_boundary(value)
793
+ return if value.is_a?(String) || value.is_a?(Hash) || value.is_a?(Array)
794
+
795
+ skip_horizontal_whitespace
796
+ b = byte
797
+ return if b.nil? || b == LF || b == CR || b == COMMA
798
+ return if b == HASH || (b == SLASH && ((c = byte_at(1)) == SLASH || c == STAR))
799
+
800
+ raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
801
+ end
802
+
803
+ # Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
804
+ # document separators. Used by the scalar-boundary check above.
805
+ def skip_horizontal_whitespace
806
+ while (b = byte)
807
+ if b == SPACE || b == TAB || b == 0x0B || b == 0x0C
808
+ advance(1)
809
+ elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
810
+ @pos += n # multibyte horizontal whitespace (NBSP, U+2000–200A, …)
811
+ else
812
+ break
813
+ end
814
+ end
815
+ end
816
+
691
817
  # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
692
818
  # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
693
819
  # never by the call stack. Mirrors the C driver to keep the two paths in
@@ -708,9 +834,10 @@ module SmarterJSON
708
834
  end
709
835
 
710
836
  vss = false # warnings: has a value landed in the current container since the last separator?
711
- loop do
837
+ input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
838
+ while true
712
839
  skip_whitespace_and_comments
713
- b = byte
840
+ b = input.getbyte(@pos)
714
841
  if at_top
715
842
  if b == LBRACE
716
843
  advance(1)
@@ -729,8 +856,17 @@ module SmarterJSON
729
856
  at_top = false
730
857
  vss = false
731
858
  elsif b.nil?
859
+ # Defensive guard: parse / each_value check eof? before calling parse_iter,
860
+ # so `at_top` never meets end-of-input here. Kept to mirror the C driver.
861
+ # :nocov:
732
862
  raise error("unexpected end of input")
863
+ # :nocov:
733
864
  else
865
+ # Top-level scalar: must be a recognized JSON value (number / literal /
866
+ # quoted string). A bare word raises — there are no top-level quoteless
867
+ # strings (Decision 2 = B-broad). In-container quoteless still uses
868
+ # parse_member_value; the scalar-vs-separator boundary is enforced by the
869
+ # parse / each_value loop via enforce_scalar_boundary.
734
870
  return parse_value
735
871
  end
736
872
  elsif b == COMMA
@@ -758,12 +894,12 @@ module SmarterJSON
758
894
  else
759
895
  key = parse_object_key
760
896
  skip_whitespace_and_comments
761
- raise error("expected ':' after key #{key.inspect}") unless byte == COLON
897
+ raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
762
898
 
763
899
  advance(1)
764
900
  skip_whitespace_and_comments
765
- b = byte
766
- if [LBRACE, LBRACKET].include?(b)
901
+ b = input.getbyte(@pos)
902
+ if b == LBRACE || b == LBRACKET
767
903
  child = b == LBRACE ? {} : []
768
904
  advance(1) # consume { or [
769
905
  store_member(cur, key, child)
@@ -771,7 +907,7 @@ module SmarterJSON
771
907
  cur = child
772
908
  cur_obj = (b == LBRACE)
773
909
  vss = false
774
- elsif [RBRACE, COMMA].include?(b)
910
+ elsif b == RBRACE || b == COMMA
775
911
  # key with a colon but no value -> null (don't consume } or ,; the loop does)
776
912
  store_member(cur, key, nil)
777
913
  warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
@@ -796,7 +932,7 @@ module SmarterJSON
796
932
  raise error("unterminated array")
797
933
  elsif b == RBRACE
798
934
  raise error("unexpected '}' — expected ']' or a value")
799
- elsif [LBRACE, LBRACKET].include?(b)
935
+ elsif b == LBRACE || b == LBRACKET
800
936
  child = b == LBRACE ? {} : []
801
937
  advance(1) # consume { or [
802
938
  cur.push(child)
@@ -818,11 +954,11 @@ module SmarterJSON
818
954
  b = byte
819
955
  return false unless b && key_start_byte?(b)
820
956
 
821
- saved = [@pos, @line, @col]
957
+ saved = @pos
822
958
  advance(1) while (c = byte) && key_continue_byte?(c)
823
959
  skip_pure_whitespace
824
960
  result = (byte == COLON)
825
- @pos, @line, @col = saved
961
+ @pos = saved
826
962
  result
827
963
  end
828
964
 
@@ -840,46 +976,72 @@ module SmarterJSON
840
976
  @pos >= @bytesize
841
977
  end
842
978
 
979
+ # Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
980
+ # is computed lazily in line_col_at only when an error/warning is built. This is
981
+ # the hot-path primitive every consumed byte goes through, so it stays O(1) with
982
+ # no block, no re-read, and no per-byte branching. Mirrors the C fj_advance.
843
983
  def advance(n = 1)
844
- n.times do
845
- b = @input.getbyte(@pos)
846
- return if b.nil?
984
+ @pos += n
985
+ @pos = @bytesize if @pos > @bytesize
986
+ end
847
987
 
988
+ # Line and 1-based BYTE column at byte position `pos`, computed lazily by scanning
989
+ # from the start of the buffer — only on the cold path (error / warning / triple-quote
990
+ # indent), never per byte. CR, LF, and CRLF each count as one newline; the column is
991
+ # the byte offset within the line. Mirrors the C extension's fj_line_col so both paths
992
+ # report identical positions.
993
+ def line_col_at(pos = @pos)
994
+ limit = pos < @bytesize ? pos : @bytesize
995
+ line = 1
996
+ col = 1
997
+ i = 0
998
+ while i < limit
999
+ b = @input.getbyte(i)
848
1000
  if b == LF
849
- @line += 1
850
- @col = 1
851
- @pos += 1
1001
+ line += 1
1002
+ col = 1
852
1003
  elsif b == CR
853
- @line += 1
854
- @col = 1
855
- @pos += 1
856
- @pos += 1 if @input.getbyte(@pos) == LF
1004
+ line += 1
1005
+ col = 1
1006
+ i += 1 if i + 1 < @bytesize && @input.getbyte(i + 1) == LF
857
1007
  else
858
- @col += 1
859
- @pos += 1
1008
+ col += 1
860
1009
  end
1010
+ i += 1
861
1011
  end
1012
+ [line, col]
1013
+ end
1014
+
1015
+ # 1-based byte column at `pos` (bytes since the last line start). Used for
1016
+ # triple-quoted-string indentation stripping. Mirrors the C fj_column.
1017
+ def column_at(pos = @pos)
1018
+ c = 1
1019
+ i = pos - 1
1020
+ while i >= 0 && (b = @input.getbyte(i)) != LF && b != CR
1021
+ c += 1
1022
+ i -= 1
1023
+ end
1024
+ c
862
1025
  end
863
1026
 
864
1027
  # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
865
1028
 
866
1029
  def skip_pure_whitespace
867
- loop do
868
- b = byte
869
- break if b.nil?
870
-
1030
+ input = @input
1031
+ pos = @pos
1032
+ while (b = input.getbyte(pos))
871
1033
  if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
872
- advance(1)
1034
+ pos += 1
873
1035
  elsif b >= 0x80
874
- n = multibyte_ws_len(@pos)
1036
+ n = multibyte_ws_len(pos)
875
1037
  break if n.zero?
876
1038
 
877
- @pos += n
878
- @col += 1
1039
+ pos += n
879
1040
  else
880
1041
  break
881
1042
  end
882
1043
  end
1044
+ @pos = pos
883
1045
  end
884
1046
 
885
1047
  # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
@@ -913,19 +1075,20 @@ module SmarterJSON
913
1075
  # A '#', '//', or '/*' starts a comment only when preceded by whitespace
914
1076
  # or at the very start of input (the comment-marker rule).
915
1077
  def skip_whitespace_and_comments
916
- loop do
1078
+ while true
917
1079
  skip_pure_whitespace
918
1080
  b = byte
919
- break if b.nil?
1081
+ if b == HASH
1082
+ break unless preceded_by_ws_or_start?
920
1083
 
921
- is_marker = (b == HASH) || (b == SLASH && [SLASH, STAR].include?(byte_at(1)))
922
- break unless is_marker
923
- break unless preceded_by_ws_or_start?
1084
+ skip_to_eol
1085
+ elsif b == SLASH
1086
+ c = byte_at(1)
1087
+ break unless (c == SLASH || c == STAR) && preceded_by_ws_or_start?
924
1088
 
925
- if b == SLASH && byte_at(1) == STAR
926
- skip_block_comment
1089
+ c == STAR ? skip_block_comment : skip_to_eol
927
1090
  else
928
- skip_to_eol
1091
+ break
929
1092
  end
930
1093
  end
931
1094
  end
@@ -965,8 +1128,9 @@ module SmarterJSON
965
1128
  # --- values ---
966
1129
 
967
1130
  # Top-level / strict value: no quoteless fallback.
1131
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1132
+ # so @pos is at the value's first byte — no leading skip needed here.
968
1133
  def parse_value
969
- skip_whitespace_and_comments
970
1134
  raise error("unexpected end of input") if eof?
971
1135
 
972
1136
  b = byte
@@ -999,8 +1163,9 @@ module SmarterJSON
999
1163
  end
1000
1164
 
1001
1165
  # Value in object-value or array-element position: quoteless allowed.
1166
+ # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
1167
+ # so @pos is at the value's first byte — no leading skip needed here.
1002
1168
  def parse_member_value
1003
- skip_whitespace_and_comments
1004
1169
  raise error("unexpected end of input") if eof?
1005
1170
 
1006
1171
  b = byte
@@ -1033,7 +1198,7 @@ module SmarterJSON
1033
1198
  until eof?
1034
1199
  if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
1035
1200
  closers.include?(@input.getbyte(@pos + 2))
1036
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1201
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1037
1202
  advance(3)
1038
1203
  return result
1039
1204
  end
@@ -1044,9 +1209,7 @@ module SmarterJSON
1044
1209
 
1045
1210
  def store_member(hash, key, value)
1046
1211
  k = @symbolize_keys ? key.to_sym : key
1047
- if hash.key?(k)
1048
- raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
1049
-
1212
+ if @check_duplicates && hash.key?(k)
1050
1213
  warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
1051
1214
  return if @duplicate_key == :first_wins
1052
1215
  end
@@ -1077,51 +1240,77 @@ module SmarterJSON
1077
1240
  start = @pos
1078
1241
  advance(1)
1079
1242
  advance(1) while (b = byte) && key_continue_byte?(b)
1080
- @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1243
+ @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1081
1244
  end
1082
1245
 
1083
1246
  # --- quoteless strings & literal classification ---
1084
1247
 
1085
1248
  def parse_quoteless_or_literal
1086
1249
  start = @pos
1087
- scan_quoteless_run
1250
+ value_end = scan_quoteless_run
1088
1251
  # A quoteless run must consume at least one byte. If the first byte is a
1089
1252
  # delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
1090
1253
  # here would make the caller's `result << parse_member_value` loop forever.
1091
1254
  # Raise instead (correct today: the Lenient Commas Option is not adopted).
1092
1255
  raise error("expected a value") if @pos == start
1093
1256
 
1094
- raw = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1095
- classify_quoteless(trim_blank(raw))
1257
+ # value_end is the end of the last non-whitespace char in the run; slicing to it
1258
+ # drops trailing whitespace without a regex (the caller already skipped leading
1259
+ # whitespace, so there is none to trim at the front). Equivalent to the old
1260
+ # trim_blank(raw) but with no per-scalar String#sub allocations.
1261
+ raw = @input.byteslice(start, value_end - start) # byteslice preserves @input's encoding
1262
+ classify_quoteless(raw)
1096
1263
  end
1097
1264
 
1098
1265
  # Advance to the end of a quoteless run. Stops at structural punctuation
1099
- # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
1100
- # whitespace. Spaces by themselves are not delimiters.
1266
+ # (',' '{' '}' '[' ']' — openers terminate symmetrically with closers, so a
1267
+ # self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
1268
+ # a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
1269
+ # themselves are not delimiters.
1270
+ # Advance @pos to the end of the quoteless run (including any trailing whitespace,
1271
+ # so the parser resumes correctly after the value). Returns value_end: the byte
1272
+ # offset just past the last NON-whitespace char, so the caller can slice off
1273
+ # trailing whitespace without a regex.
1101
1274
  def scan_quoteless_run
1275
+ input = @input
1276
+ pos = @pos
1277
+ # Fast path: one C-level byteindex jumps to the first structural terminator or
1278
+ # whitespace. If it lands on a terminator (or EOF) the run had no interior whitespace,
1279
+ # so [pos, hit) is the whole value — value_end == hit (no trailing trim) and no comment
1280
+ # marker can apply (those only break after whitespace). This is the common case
1281
+ # (numbers and simple tokens). Anything with whitespace falls to the byte-by-byte loop.
1282
+ if BYTEINDEX_AVAILABLE
1283
+ hit = input.byteindex(QL_BREAK, pos) || @bytesize
1284
+ b = hit < @bytesize ? input.getbyte(hit) : nil
1285
+ if b.nil? || b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1286
+ @pos = hit
1287
+ return hit
1288
+ end
1289
+ end
1290
+
1291
+ # Slow path: the run contains whitespace — scan byte by byte to honor interior
1292
+ # whitespace, trailing-whitespace trimming (value_end is the end of the last
1293
+ # non-whitespace char), and the comment-marker-after-whitespace rule.
1294
+ value_end = pos
1102
1295
  prev_ws = false
1103
- loop do
1104
- b = byte
1105
- break if b.nil?
1106
- break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
1107
- break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
1296
+ while (b = input.getbyte(pos))
1297
+ break if b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
1298
+ break if prev_ws && (b == HASH || (b == SLASH && ((c = input.getbyte(pos + 1)) == SLASH || c == STAR)))
1108
1299
 
1109
1300
  if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
1110
1301
  prev_ws = true
1111
- advance(1)
1112
- elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
1302
+ pos += 1
1303
+ elsif b >= 0x80 && (n = multibyte_ws_len(pos)).positive?
1113
1304
  prev_ws = true
1114
- @pos += n
1115
- @col += 1
1305
+ pos += n
1116
1306
  else
1117
1307
  prev_ws = false
1118
- advance(1)
1308
+ pos += 1
1309
+ value_end = pos
1119
1310
  end
1120
1311
  end
1121
- end
1122
-
1123
- def trim_blank(str)
1124
- str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
1312
+ @pos = pos
1313
+ value_end
1125
1314
  end
1126
1315
 
1127
1316
  def classify_quoteless(str)
@@ -1132,7 +1321,7 @@ module SmarterJSON
1132
1321
  when "undefined" then return nil
1133
1322
  when "NaN" then return Float::NAN
1134
1323
  when "Infinity", "+Infinity" then return Float::INFINITY
1135
- when "-Infinity" then return (-Float::INFINITY)
1324
+ when "-Infinity" then return -Float::INFINITY
1136
1325
  end
1137
1326
  num = numeric_value(str)
1138
1327
  num.equal?(NOT_NUMERIC) ? str : num
@@ -1140,31 +1329,73 @@ module SmarterJSON
1140
1329
 
1141
1330
  # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
1142
1331
  def numeric_value(str)
1143
- if HEX_RE.match?(str)
1144
- neg = str.start_with?("-")
1332
+ # Cheap hex gate: only invoke HEX_RE when the token actually looks like [+-]?0x… .
1333
+ # A Regexp#match? has real per-call cost; almost no number is hex, so the 1–3 byte
1334
+ # check skips that call on the common path (measured +21% on long-token decimals).
1335
+ if hex_prefix?(str) && HEX_RE.match?(str)
1336
+ neg = str.getbyte(0) == MINUS
1145
1337
  body = str.sub(/\A[-+]/, "").delete("_") # "0x...."
1146
1338
  v = body[2..-1].to_i(16)
1147
1339
  return neg ? -v : v
1148
1340
  end
1149
1341
  return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)
1150
1342
 
1151
- body = str.delete("_")
1343
+ # delete("_") allocates a fresh string even when there is nothing to delete; on long
1344
+ # number tokens that is a real per-value allocation. Underscores are rare, so only
1345
+ # pay it when the token actually contains one (measured +27% on long-token decimals).
1346
+ body = str.include?("_") ? str.delete("_") : str
1152
1347
  body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
1153
1348
  end
1154
1349
 
1155
- # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
1350
+ # True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match.
1351
+ def hex_prefix?(str)
1352
+ c0 = str.getbyte(0)
1353
+ if c0 == ZERO
1354
+ x = str.getbyte(1)
1355
+ x == LOWER_X || x == UPPER_X
1356
+ elsif c0 == MINUS || c0 == PLUS
1357
+ str.getbyte(1) == ZERO && ((x = str.getbyte(2)) == LOWER_X || x == UPPER_X)
1358
+ else
1359
+ false
1360
+ end
1361
+ end
1362
+
1363
+ # A decimal (has '.' or exponent). decimal_precision: :float -> Float,
1156
1364
  # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
1157
1365
  # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
1158
1366
  def decimal_value(body)
1159
- case @bigdecimal_load
1367
+ case @decimal_precision
1160
1368
  when :float then body.to_f
1161
1369
  when :bigdecimal then to_big_decimal(body)
1162
1370
  else significant_digits(body) > 16 ? to_big_decimal(body) : body.to_f
1163
1371
  end
1164
1372
  end
1165
1373
 
1374
+ # Count significant mantissa digits (leading zeros excluded, exponent ignored) to pick
1375
+ # Float vs BigDecimal in :auto mode. A single byte-scan — the old three-regex version
1376
+ # (strip exponent, strip non-digits, strip leading zeros, .length) ran on every float
1377
+ # and dominated the number path's cost. body is a DEC_RE-validated token (digits, at most
1378
+ # one '.', optional sign, optional e/E exponent), underscores already removed.
1166
1379
  def significant_digits(body)
1167
- body.sub(/[eE].*\z/, "").gsub(/[^0-9]/, "").sub(/\A0+/, "").length
1380
+ count = 0
1381
+ leading = true
1382
+ i = 0
1383
+ n = body.bytesize
1384
+ while i < n
1385
+ b = body.getbyte(i)
1386
+ i += 1
1387
+ break if b == LOWER_E || b == UPPER_E # exponent: its digits aren't significant
1388
+
1389
+ next unless b >= ZERO && b <= NINE # skip sign and the decimal point
1390
+
1391
+ if leading && b == ZERO
1392
+ next # leading zero (incl. those after '.') — not significant
1393
+ else
1394
+ leading = false
1395
+ count += 1
1396
+ end
1397
+ end
1398
+ count
1168
1399
  end
1169
1400
 
1170
1401
  def to_big_decimal(body)
@@ -1175,7 +1406,11 @@ module SmarterJSON
1175
1406
  body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
1176
1407
  BigDecimal(body)
1177
1408
  rescue ArgumentError
1409
+ # Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
1410
+ # so this fallback is unreachable from valid input. Kept as a safety net.
1411
+ # :nocov:
1178
1412
  body.to_f
1413
+ # :nocov:
1179
1414
  end
1180
1415
 
1181
1416
  # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
@@ -1194,7 +1429,7 @@ module SmarterJSON
1194
1429
  end
1195
1430
 
1196
1431
  def parse_triple_quoted
1197
- indent = @col - 1
1432
+ indent = column_at(@pos) - 1
1198
1433
  advance(3)
1199
1434
  raw_start = @pos
1200
1435
  until eof?
@@ -1204,7 +1439,7 @@ module SmarterJSON
1204
1439
  end
1205
1440
  raise error("unterminated triple-quoted string") if eof?
1206
1441
 
1207
- raw = @input.byteslice(raw_start, @pos - raw_start).force_encoding(@input.encoding)
1442
+ raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
1208
1443
  advance(3)
1209
1444
  strip_triple(raw, indent)
1210
1445
  end
@@ -1234,20 +1469,30 @@ module SmarterJSON
1234
1469
  def parse_string(quote)
1235
1470
  advance(1)
1236
1471
  start = @pos
1237
- has_escape = false
1472
+ # Fast path (the common case — a string with no escapes): jump straight to the
1473
+ # closing quote with byteindex. It is called only here, from `start`, which is
1474
+ # always a character boundary, so byteindex never sees a mid-char offset.
1475
+ hit = scan_string_delimiter(quote)
1476
+ raise error("unterminated string") if hit.nil?
1477
+
1478
+ if @input.getbyte(hit) == quote
1479
+ @pos = hit
1480
+ result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
1481
+ advance(1)
1482
+ return result
1483
+ end
1484
+
1485
+ # Escape path: a backslash precedes the closing quote. Scan byte by byte from
1486
+ # here — byteindex can't be used past a backslash (a lenient \<multibyte> would
1487
+ # leave @pos mid-character), and this lets the decoder flag invalid escapes
1488
+ # exactly as before. decode_string_with_escapes handles the whole [start, finish].
1489
+ @pos = hit
1238
1490
  while (b = byte)
1239
1491
  if b == quote
1240
- if has_escape
1241
- decoded = decode_string_with_escapes(start, @pos, quote)
1242
- advance(1)
1243
- return decoded
1244
- else
1245
- result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
1246
- advance(1)
1247
- return result
1248
- end
1492
+ decoded = decode_string_with_escapes(start, @pos, quote)
1493
+ advance(1)
1494
+ return decoded
1249
1495
  elsif b == BACKSLASH
1250
- has_escape = true
1251
1496
  advance(1)
1252
1497
  raise error("unterminated string escape") if eof?
1253
1498
 
@@ -1259,6 +1504,20 @@ module SmarterJSON
1259
1504
  raise error("unterminated string")
1260
1505
  end
1261
1506
 
1507
+ # Byte index of the next closing quote or backslash at/after @pos, or nil if
1508
+ # neither occurs before EOF. byteindex scans inside MRI's C; the fallback is a
1509
+ # tight getbyte loop (the ASCII delimiters never alias UTF-8 continuation bytes,
1510
+ # so byte scanning is correct for UTF-8 string content).
1511
def scan_string_delimiter(quote)
  # Returns the byte offset (>= @pos) of the first `quote` or BACKSLASH byte in
  # @input, or nil when neither is found before @bytesize.
  unless BYTEINDEX_AVAILABLE
    # Fallback when String#byteindex is unavailable: plain byte-by-byte scan.
    cursor = @pos
    while cursor < @bytesize
      byte = @input.getbyte(cursor)
      return cursor if byte == quote || byte == BACKSLASH

      cursor += 1
    end
    return nil
  end

  # Any quote other than DQUOTE uses the single-quote pattern.
  pattern = quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH
  @input.byteindex(pattern, @pos)
end
1520
+
1262
1521
  def decode_string_with_escapes(start, finish, _quote)
1263
1522
  buf = String.new(encoding: Encoding::ASCII_8BIT)
1264
1523
  i = start
@@ -1350,7 +1609,7 @@ module SmarterJSON
1350
1609
 
1351
1610
  if byte == ZERO
1352
1611
  advance(1)
1353
- if [LOWER_X, UPPER_X].include?(byte)
1612
+ if (x = byte) == LOWER_X || x == UPPER_X
1354
1613
  advance(1)
1355
1614
  hex_start = @pos
1356
1615
  advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
@@ -1375,10 +1634,10 @@ module SmarterJSON
1375
1634
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
1376
1635
  end
1377
1636
 
1378
- if [LOWER_E, UPPER_E].include?(byte)
1637
+ if (e = byte) == LOWER_E || e == UPPER_E
1379
1638
  is_float = true
1380
1639
  advance(1)
1381
- advance(1) if [PLUS, MINUS].include?(byte)
1640
+ advance(1) if (s = byte) == PLUS || s == MINUS
1382
1641
  raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
1383
1642
 
1384
1643
  advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
@@ -1414,11 +1673,13 @@ module SmarterJSON
1414
1673
  def warn(type, message)
1415
1674
  return unless @on_warning
1416
1675
 
1417
- @on_warning.call(Warning.new(type, message, @line, @col))
1676
+ line, col = line_col_at(@pos)
1677
+ @on_warning.call(Warning.new(type, message, line, col))
1418
1678
  end
1419
1679
 
1420
1680
  def error(message)
1421
- ParseError.new(message, @line, @col)
1681
+ line, col = line_col_at(@pos)
1682
+ ParseError.new(message, line, col)
1422
1683
  end
1423
1684
 
1424
1685
  def display_byte(b)