RubyGems - smarter_json - Versions diffs - 0.9.2 → 1.0.0 - Mend

smarter_json 0.9.2 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.gitignore +2 -0
data/CHANGELOG.md +89 -55
data/README.md +216 -73
data/docs/_introduction.md +6 -12
data/docs/basic_read_api.md +29 -19
data/docs/basic_write_api.md +3 -3
data/docs/examples.md +32 -23
data/docs/options.md +20 -19
data/ext/smarter_json/smarter_json.c +246 -92
data/ext/smarter_json/vendor/LICENSE-fast_float-MIT +27 -0
data/ext/smarter_json/vendor/eisel_lemire.h +117 -0
data/ext/smarter_json/vendor/eisel_lemire.md +29 -0
data/ext/smarter_json/vendor/eisel_lemire_powers.h +663 -0
data/lib/smarter_json/backports.rb +28 -0
data/lib/smarter_json/generator.rb +100 -65
data/lib/smarter_json/options.rb +65 -0
data/lib/smarter_json/parser.rb +441 -141
data/lib/smarter_json/version.rb +1 -1
data/lib/smarter_json.rb +3 -1
metadata +21 -11
data/ext/smarter_json/vendor/ryu.h +0 -819
data/ext/smarter_json/vendor/ryu.md +0 -22

data/lib/smarter_json/parser.rb CHANGED

@@ -1,5 +1,9 @@
 # frozen_string_literal: true
+# Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
+# activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
+using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
 module SmarterJSON
   # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
   # from the shared SmarterJSON::Error base.
@@ -12,15 +16,20 @@ module SmarterJSON
   # is always content, never a filename — use process_file for paths.) The values
   # in `options` override Parser::DEFAULT_OPTIONS.
   #
-  # Without a block: returns nil (zero documents), the value (one document), or an
-  # Array of the values (two or more — NDJSON / JSONL / concatenated / whitespace-
-  # separated). :acceleration (default true) selects the C extension when compiled
-  # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
+  # Without a block: always returns an Array of the documents found — [] for none,
+  # [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
+  # top-level value must be a recognized JSON value (number / literal / quoted
+  # string / object / array) or an implicit-root object, else it raises. For the
+  # single-document case use SmarterJSON.process_one (returns the bare value).
+  # :acceleration (default true) selects the C extension when compiled and loaded
+  # (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
   #
-  # With a block: yields each top-level document as it is parsed, and returns nil.
-  # For an IO this streams document-by-document in bounded memory — it reads the
-  # stream as newline-delimited documents (NDJSON / JSONL), one per line.
+  # With a block: yields each top-level document as it is parsed, and returns the
+  # document count. For an IO this streams document-by-document in bounded memory —
+  # it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
+  # line.
   def process(input, options = {}, &block)
+    options = Options.process_options(options)
     if input.is_a?(String)
       Recovery.process_string(input, options, &block)
     elsif input.respond_to?(:read)
@@ -39,7 +48,8 @@ module SmarterJSON
   # loading the whole file); the documents are read as newline-delimited
   # (NDJSON / JSONL), one per line.
   def process_file(path, options = {}, &block)
-    encoding = options.fetch(:encoding, "UTF-8")
+    options = Options.process_options(options)
+    encoding = options[:encoding] || "UTF-8"
     if block
       File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
     else
@@ -47,8 +57,44 @@ module SmarterJSON
     end
   end
-  # Parse a String of JSON content (the in-memory path). Returns nil (block) or
-  # the value / Array (no block); the C extension is used when available.
+  # SmarterJSON.process_one(input, options = {}) — the single-document accessor.
+  #
+  # Returns the first document's value (or nil when the input holds no documents).
+  # When the input holds MORE than one document it returns the first and warns once
+  # — it never raises, since an extra document is valid data; the warning goes to
+  # on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
+  # For an IO this is bounded memory: it parses just the first document and stops as
+  # soon as a second is seen, instead of materialising the whole stream the way
+  # process(io).first would. (process(input).first and process(input)[0] silently
+  # drop documents 2+ — a footgun; use process_one instead.)
+  def process_one(input, options = {})
+    options = Options.process_options(options)
+    # IO: bounded memory — parse just the first document and stop once a second is
+    # seen (peek-to-warn). A String is already in memory, so use the plain no-block
+    # path: it returns the full (wrapper-recovered, de-duplicated) Array in one pass,
+    # which also avoids the reactive-recovery double-yield the block path would hit.
+    unless input.respond_to?(:read)
+      docs = process(input, options)
+      warn_extra_documents(options) if docs.length > 1
+      return docs.first
+    end
+    first = nil
+    count = 0
+    catch(:smarter_json_first_document) do
+      process(input, options) do |doc|
+        count += 1
+        first = doc if count == 1
+        throw(:smarter_json_first_document) if count > 1
+      end
+    end
+    warn_extra_documents(options) if count > 1
+    first
+  end
+  # Parse a String of JSON content (the in-memory path). Returns an Array of the
+  # documents found (empty for none); the C extension is used when available.
   def process_content(input, options, &block)
     if block
       if options.fetch(:acceleration, true) && HAS_ACCELERATION
@@ -63,14 +109,55 @@ module SmarterJSON
     end
   end
+  # Smart default for the nil :encoding option. A String tagged ASCII-8BIT (BINARY)
+  # is how Net::HTTP and many HTTP libraries hand back a response body even when the
+  # bytes are UTF-8. JSON's interchange encoding is UTF-8, so we relabel such input
+  # to UTF-8 when its bytes are valid UTF-8 — otherwise string values would come back
+  # tagged ASCII-8BIT and compare unequal to UTF-8 literals (a silent footgun). When
+  # the bytes are NOT valid UTF-8 we raise EncodingError rather than guess a legacy
+  # encoding — pass an explicit :encoding for that. An explicit (non-nil) :encoding,
+  # or any non-BINARY tag, is left untouched (the per-path force_encoding / validation
+  # handles it). Only relabels — never transcodes.
+  def normalize_default_encoding(input, options)
+    return input unless options[:encoding].nil?
+    return input unless input.encoding == Encoding::ASCII_8BIT
+    utf8 = input.dup.force_encoding(Encoding::UTF_8)
+    return utf8 if utf8.valid_encoding?
+    raise EncodingError, "input is tagged ASCII-8BIT and is not valid UTF-8 — pass encoding: to declare its encoding"
+  end
   # Stream documents from an IO incrementally, yielding each recovered top-level
   # document without slurping the whole input into memory first.
   def stream_io(io, options, &block)
-    Framer.each_document(io) { |doc| Recovery.process_string(doc, options, &block) }
-    nil
+    count = 0
+    Framer.each_document(io) do |doc|
+      # Recovery.process_string yields each value and returns how many it yielded;
+      # blank / comment-only framed segments yield none, so count tracks actual
+      # documents (== values yielded), not raw framed segments.
+      count += Recovery.process_string(doc, options, &block)
+    end
+    count
   end
-  private_class_method :process_content, :stream_io
+  # process_one's "more than one document" notice — routed to on_warning if the caller
+  # gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
+  # never raised.
+  def warn_extra_documents(options)
+    message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
+              "dropping the rest. Use SmarterJSON.process to get every document."
+    handler = options[:on_warning]
+    if handler
+      handler.call(Warning.new(:extra_documents, message, nil, nil))
+    elsif defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
+      Rails.logger.warn(message)
+    else
+      Kernel.warn(message)
+    end
+  end
+  private_class_method :process_content, :stream_io, :warn_extra_documents
   # Named byte values, shared by the Parser FSM and the Framer / Recovery byte
   # scanners so none of them spell out raw hex. Included where needed.
@@ -119,7 +206,7 @@ module SmarterJSON
     module_function
-    def each_document(io, &block)
+    def each_document(io)
       buffer = +""
       scan = 0
       doc_start = nil
@@ -343,6 +430,7 @@ module SmarterJSON
     module_function
     def process_string(input, options, &block)
+      input = SmarterJSON.send(:normalize_default_encoding, input, options)
       return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
       # Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
@@ -385,15 +473,23 @@ module SmarterJSON
       handler = options[:on_warning]
       emit_wrapper_warnings(payloads, handler)
-      results = payloads.map do |payload|
-        SmarterJSON.send(:process_content, payload[:slice], options)
+      if block_given?
+        count = 0
+        payloads.each do |payload|
+          SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
+            block.call(doc)
+            count += 1
+          end
+        end
+        return count
       end
-      return results.each(&block).then { nil } if block_given?
-      return nil if results.empty?
-      return results.first if results.length == 1
-      results
+      # Each payload's process_content now returns an Array of its documents; flatten
+      # so several recovered payloads yield one flat Array<doc> (the always-array
+      # contract), not an Array of Arrays.
+      payloads.flat_map do |payload|
+        SmarterJSON.send(:process_content, payload[:slice], options)
+      end
     end
     def emit_wrapper_warnings(payloads, handler)
@@ -613,18 +709,22 @@ module SmarterJSON
     # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
     # would change the string — so when it doesn't match, we skip normalization.
     NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
-    BLANK_HEAD  = /\A[[:space:]]+/.freeze
-    BLANK_TAIL  = /[[:space:]]+\z/.freeze
-    # All caller-facing settings live in one options hash (smarter_csv style).
-    DEFAULT_OPTIONS = {
-      acceleration: true, # use the C extension when available
-      encoding: nil, # label the input's encoding (no transcoding)
-      symbolize_keys: false, # Symbol keys instead of String
-      duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
-      bigdecimal_load: :auto, # :auto | :float | :bigdecimal (Oj-compatible)
-      on_warning: nil, # a callable invoked once per non-fatal lenient fix (a SmarterJSON::Warning)
-    }.freeze
+    # parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
+    # MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
+    # portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
+    BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
+    DQUOTE_OR_BACKSLASH = /["\\]/.freeze
+    SQUOTE_OR_BACKSLASH = /['\\]/.freeze
+    # scan_quoteless_run's fast path jumps (in C) to the first structural terminator
+    # (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
+    # incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
+    # interior whitespace, so there's nothing to trim and no comment marker can apply.
+    QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
+    # The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
+    DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
     def initialize(input, options = {})
       raise ArgumentError, "input must be a String" unless input.is_a?(String)
@@ -632,8 +732,13 @@ module SmarterJSON
       opts = DEFAULT_OPTIONS.merge(options)
       @symbolize_keys  = opts[:symbolize_keys]
       @duplicate_key   = opts[:duplicate_key]
-      @bigdecimal_load = opts[:bigdecimal_load]
-      @on_warning      = opts[:on_warning]
+      @decimal_precision = opts[:decimal_precision]
+      @on_warning = opts[:on_warning]
+      # store_member only needs the (per-member) Hash#key? duplicate lookup when a
+      # repeat would change behavior: a warning must fire, or :first_wins must keep the
+      # first. With the default (:last_wins, no handler) a duplicate just overwrites,
+      # which `hash[k] = value` already does — so skip the lookup entirely.
+      @check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
       encoding = opts[:encoding]
       @input = encoding ? input.dup.force_encoding(encoding) : input
@@ -642,8 +747,6 @@ module SmarterJSON
       @bytesize = @input.bytesize
       # Skip a UTF-8 BOM (EF BB BF) at the start of input.
       @pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
-      @line = 1
-      @col = 1
     end
     # No block: auto-detect the document count for free (the same "is there
@@ -653,17 +756,14 @@ module SmarterJSON
     # value. Commas do NOT separate documents (only whitespace / newline /
     # concatenation do), so a bracketless comma list still raises in parse_document.
     def parse
-      skip_whitespace_and_comments
-      return nil if eof?
-      value = parse_document
-      skip_whitespace_and_comments
-      return value if eof?
-      results = [value]
+      results = []
       until eof?
-        results << parse_document
-        skip_whitespace_and_comments
+        skip_document_separators
+        break if eof?
+        value = parse_document
+        enforce_scalar_boundary(value)
+        results << value
       end
       results
     end
@@ -671,13 +771,17 @@ module SmarterJSON
     # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
     # whitespace-separated). Used by the block form of SmarterJSON.process.
     def each_value
-      loop do
-        skip_whitespace_and_comments
+      count = 0
+      until eof?
+        skip_document_separators
         break if eof?
-        yield parse_document
+        value = parse_document
+        enforce_scalar_boundary(value)
+        yield value
+        count += 1
       end
-      nil
+      count
     end
     private
@@ -688,6 +792,48 @@ module SmarterJSON
       parse_iter(implicit_root_object_ahead?)
     end
+    # Between top-level documents, whitespace, comments, AND commas all separate
+    # (commas collapse like the in-container lenient-comma rule). A space alone never
+    # separates — that is handled inside the document by the quoteless run, so
+    # `1 2 3` is one document (the string "1 2 3") while `1, 2, 3` is three.
+    def skip_document_separators
+      skip_whitespace_and_comments
+      while byte == COMMA
+        advance(1)
+        skip_whitespace_and_comments
+      end
+    end
+    # After a top-level value: a self-delimiting value (object / array / quoted string)
+    # may be followed by anything (the next document self-delimits), but a bare scalar
+    # (number / keyword) must be followed by a real separator — a newline, ',', a
+    # comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
+    # rather than silently splitting; bare top-level words raise in parse_value itself.
+    def enforce_scalar_boundary(value)
+      return if value.is_a?(String) || value.is_a?(Hash) || value.is_a?(Array)
+      skip_horizontal_whitespace
+      b = byte
+      return if b.nil? || b == LF || b == CR || b == COMMA
+      return if b == HASH || (b == SLASH && ((c = byte_at(1)) == SLASH || c == STAR))
+      raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
+    end
+    # Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
+    # document separators. Used by the scalar-boundary check above.
+    def skip_horizontal_whitespace
+      while (b = byte)
+        if b == SPACE || b == TAB || b == 0x0B || b == 0x0C
+          advance(1)
+        elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
+          @pos += n # multibyte horizontal whitespace (NBSP, U+2000–200A, …)
+        else
+          break
+        end
+      end
+    end
     # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
     # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
     # never by the call stack. Mirrors the C driver to keep the two paths in
@@ -708,9 +854,10 @@ module SmarterJSON
       end
       vss = false # warnings: has a value landed in the current container since the last separator?
-      loop do
+      input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
+      while true
         skip_whitespace_and_comments
-        b = byte
+        b = input.getbyte(@pos)
         if at_top
           if b == LBRACE
             advance(1)
@@ -729,8 +876,17 @@ module SmarterJSON
             at_top = false
             vss = false
           elsif b.nil?
+            # Defensive guard: parse / each_value check eof? before calling parse_iter,
+            # so `at_top` never meets end-of-input here. Kept to mirror the C driver.
+            # :nocov:
             raise error("unexpected end of input")
+            # :nocov:
           else
+            # Top-level scalar: must be a recognized JSON value (number / literal /
+            # quoted string). A bare word raises — there are no top-level quoteless
+            # strings (Decision 2 = B-broad). In-container quoteless still uses
+            # parse_member_value; the scalar-vs-separator boundary is enforced by the
+            # parse / each_value loop via enforce_scalar_boundary.
             return parse_value
           end
         elsif b == COMMA
@@ -758,12 +914,12 @@ module SmarterJSON
           else
             key = parse_object_key
             skip_whitespace_and_comments
-            raise error("expected ':' after key #{key.inspect}") unless byte == COLON
+            raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
             advance(1)
             skip_whitespace_and_comments
-            b = byte
-            if [LBRACE, LBRACKET].include?(b)
+            b = input.getbyte(@pos)
+            if b == LBRACE || b == LBRACKET
               child = b == LBRACE ? {} : []
               advance(1) # consume { or [
               store_member(cur, key, child)
@@ -771,7 +927,7 @@ module SmarterJSON
               cur = child
               cur_obj = (b == LBRACE)
               vss = false
-            elsif [RBRACE, COMMA].include?(b)
+            elsif b == RBRACE || b == COMMA
               # key with a colon but no value -> null (don't consume } or ,; the loop does)
               store_member(cur, key, nil)
               warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
@@ -796,7 +952,7 @@ module SmarterJSON
             raise error("unterminated array")
           elsif b == RBRACE
             raise error("unexpected '}' — expected ']' or a value")
-          elsif [LBRACE, LBRACKET].include?(b)
+          elsif b == LBRACE || b == LBRACKET
             child = b == LBRACE ? {} : []
             advance(1) # consume { or [
             cur.push(child)
@@ -818,11 +974,11 @@ module SmarterJSON
       b = byte
       return false unless b && key_start_byte?(b)
-      saved = [@pos, @line, @col]
+      saved = @pos
       advance(1) while (c = byte) && key_continue_byte?(c)
       skip_pure_whitespace
       result = (byte == COLON)
-      @pos, @line, @col = saved
+      @pos = saved
       result
     end
@@ -840,46 +996,72 @@ module SmarterJSON
       @pos >= @bytesize
     end
+    # Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
+    # is computed lazily in line_col_at only when an error/warning is built. This is
+    # the hot-path primitive every consumed byte goes through, so it stays O(1) with
+    # no block, no re-read, and no per-byte branching. Mirrors the C fj_advance.
     def advance(n = 1)
-      n.times do
-        b = @input.getbyte(@pos)
-        return if b.nil?
+      @pos += n
+      @pos = @bytesize if @pos > @bytesize
+    end
+    # Line and 1-based BYTE column at byte position `pos`, computed lazily by scanning
+    # from the start of the buffer — only on the cold path (error / warning / triple-quote
+    # indent), never per byte. CR, LF, and CRLF each count as one newline; the column is
+    # the byte offset within the line. Mirrors the C extension's fj_line_col so both paths
+    # report identical positions.
+    def line_col_at(pos = @pos)
+      limit = pos < @bytesize ? pos : @bytesize
+      line = 1
+      col = 1
+      i = 0
+      while i < limit
+        b = @input.getbyte(i)
         if b == LF
-          @line += 1
-          @col = 1
-          @pos += 1
+          line += 1
+          col = 1
         elsif b == CR
-          @line += 1
-          @col = 1
-          @pos += 1
-          @pos += 1 if @input.getbyte(@pos) == LF
+          line += 1
+          col = 1
+          i += 1 if i + 1 < @bytesize && @input.getbyte(i + 1) == LF
         else
-          @col += 1
-          @pos += 1
+          col += 1
         end
+        i += 1
       end
+      [line, col]
+    end
+    # 1-based byte column at `pos` (bytes since the last line start). Used for
+    # triple-quoted-string indentation stripping. Mirrors the C fj_column.
+    def column_at(pos = @pos)
+      c = 1
+      i = pos - 1
+      while i >= 0 && (b = @input.getbyte(i)) != LF && b != CR
+        c += 1
+        i -= 1
+      end
+      c
     end
     # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
     def skip_pure_whitespace
-      loop do
-        b = byte
-        break if b.nil?
+      input = @input
+      pos = @pos
+      while (b = input.getbyte(pos))
         if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
-          advance(1)
+          pos += 1
         elsif b >= 0x80
-          n = multibyte_ws_len(@pos)
+          n = multibyte_ws_len(pos)
           break if n.zero?
-          @pos += n
-          @col += 1
+          pos += n
         else
           break
         end
       end
+      @pos = pos
     end
     # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
@@ -913,19 +1095,20 @@ module SmarterJSON
     # A '#', '//', or '/*' starts a comment only when preceded by whitespace
     # or at the very start of input (the comment-marker rule).
     def skip_whitespace_and_comments
-      loop do
+      while true
         skip_pure_whitespace
         b = byte
-        break if b.nil?
+        if b == HASH
+          break unless preceded_by_ws_or_start?
-        is_marker = (b == HASH) || (b == SLASH && [SLASH, STAR].include?(byte_at(1)))
-        break unless is_marker
-        break unless preceded_by_ws_or_start?
+          skip_to_eol
+        elsif b == SLASH
+          c = byte_at(1)
+          break unless (c == SLASH || c == STAR) && preceded_by_ws_or_start?
-        if b == SLASH && byte_at(1) == STAR
-          skip_block_comment
+          c == STAR ? skip_block_comment : skip_to_eol
         else
-          skip_to_eol
+          break
         end
       end
     end
@@ -965,8 +1148,9 @@ module SmarterJSON
     # --- values ---
     # Top-level / strict value: no quoteless fallback.
+    # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
+    # so @pos is at the value's first byte — no leading skip needed here.
     def parse_value
-      skip_whitespace_and_comments
       raise error("unexpected end of input") if eof?
       b = byte
@@ -999,8 +1183,9 @@ module SmarterJSON
     end
     # Value in object-value or array-element position: quoteless allowed.
+    # Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
+    # so @pos is at the value's first byte — no leading skip needed here.
     def parse_member_value
-      skip_whitespace_and_comments
       raise error("unexpected end of input") if eof?
       b = byte
@@ -1033,7 +1218,7 @@ module SmarterJSON
       until eof?
         if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
            closers.include?(@input.getbyte(@pos + 2))
-          result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+          result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
           advance(3)
           return result
         end
@@ -1044,9 +1229,7 @@ module SmarterJSON
     def store_member(hash, key, value)
       k = @symbolize_keys ? key.to_sym : key
-      if hash.key?(k)
-        raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
+      if @check_duplicates && hash.key?(k)
         warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
         return if @duplicate_key == :first_wins
       end
@@ -1057,6 +1240,12 @@ module SmarterJSON
       b = byte
       return parse_string(DQUOTE) if b == DQUOTE
       return parse_string(SQUOTE) if b == SQUOTE
+      # A key may open with a smart/curly quote too (word-processor paste curls keys,
+      # not just values) — route to the same reader values already use.
+      kind = smart_quote_kind(@pos)
+      return parse_smart_string(kind) if kind
       raise error("expected a key") unless b && key_start_byte?(b)
       parse_identifier_key
@@ -1077,51 +1266,77 @@ module SmarterJSON
       start = @pos
       advance(1)
       advance(1) while (b = byte) && key_continue_byte?(b)
-      @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+      @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
     end
     # --- quoteless strings & literal classification ---
     def parse_quoteless_or_literal
       start = @pos
-      scan_quoteless_run
+      value_end = scan_quoteless_run
       # A quoteless run must consume at least one byte. If the first byte is a
       # delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
       # here would make the caller's `result << parse_member_value` loop forever.
       # Raise instead (correct today: the Lenient Commas Option is not adopted).
       raise error("expected a value") if @pos == start
-      raw = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
-      classify_quoteless(trim_blank(raw))
+      # value_end is the end of the last non-whitespace char in the run; slicing to it
+      # drops trailing whitespace without a regex (the caller already skipped leading
+      # whitespace, so there is none to trim at the front). Equivalent to the old
+      # trim_blank(raw) but with no per-scalar String#sub allocations.
+      raw = @input.byteslice(start, value_end - start) # byteslice preserves @input's encoding
+      classify_quoteless(raw)
     end
     # Advance to the end of a quoteless run. Stops at structural punctuation
-    # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
-    # whitespace. Spaces by themselves are not delimiters.
+    # (',' '{' '}' '[' ']' — openers terminate symmetrically with closers, so a
+    # self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
+    # a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
+    # themselves are not delimiters.
+    # Advance @pos to the end of the quoteless run (including any trailing whitespace,
+    # so the parser resumes correctly after the value). Returns value_end: the byte
+    # offset just past the last NON-whitespace char, so the caller can slice off
+    # trailing whitespace without a regex.
     def scan_quoteless_run
+      input = @input
+      pos = @pos
+      # Fast path: one C-level byteindex jumps to the first structural terminator or
+      # whitespace. If it lands on a terminator (or EOF) the run had no interior whitespace,
+      # so [pos, hit) is the whole value — value_end == hit (no trailing trim) and no comment
+      # marker can apply (those only break after whitespace). This is the common case
+      # (numbers and simple tokens). Anything with whitespace falls to the byte-by-byte loop.
+      if BYTEINDEX_AVAILABLE
+        hit = input.byteindex(QL_BREAK, pos) || @bytesize
+        b = hit < @bytesize ? input.getbyte(hit) : nil
+        if b.nil? || b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
+          @pos = hit
+          return hit
+        end
+      end
+      # Slow path: the run contains whitespace — scan byte by byte to honor interior
+      # whitespace, trailing-whitespace trimming (value_end is the end of the last
+      # non-whitespace char), and the comment-marker-after-whitespace rule.
+      value_end = pos
       prev_ws = false
-      loop do
-        b = byte
-        break if b.nil?
-        break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
-        break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
+      while (b = input.getbyte(pos))
+        break if b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
+        break if prev_ws && (b == HASH || (b == SLASH && ((c = input.getbyte(pos + 1)) == SLASH || c == STAR)))
         if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
           prev_ws = true
-          advance(1)
-        elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
+          pos += 1
+        elsif b >= 0x80 && (n = multibyte_ws_len(pos)).positive?
           prev_ws = true
-          @pos += n
-          @col += 1
+          pos += n
         else
           prev_ws = false
-          advance(1)
+          pos += 1
+          value_end = pos
         end
       end
-    end
-    def trim_blank(str)
-      str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
+      @pos = pos
+      value_end
     end
     def classify_quoteless(str)
@@ -1132,7 +1347,7 @@ module SmarterJSON
       when "undefined"             then return nil
       when "NaN"                   then return Float::NAN
       when "Infinity", "+Infinity" then return Float::INFINITY
-      when "-Infinity"             then return (-Float::INFINITY)
+      when "-Infinity"             then return -Float::INFINITY
       end
       num = numeric_value(str)
       num.equal?(NOT_NUMERIC) ? str : num
@@ -1140,31 +1355,86 @@ module SmarterJSON
     # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
     def numeric_value(str)
-      if HEX_RE.match?(str)
-        neg = str.start_with?("-")
+      # Cheap hex gate: only invoke HEX_RE when the token actually looks like [+-]?0x… .
+      # A Regexp#match? has real per-call cost; almost no number is hex, so the 1–3 byte
+      # check skips that call on the common path (measured +21% on long-token decimals).
+      if hex_prefix?(str) && HEX_RE.match?(str)
+        neg = str.getbyte(0) == MINUS
         body = str.sub(/\A[-+]/, "").delete("_") # "0x...."
         v = body[2..-1].to_i(16)
         return neg ? -v : v
       end
       return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)
-      body = str.delete("_")
+      # delete("_") allocates a fresh string even when there is nothing to delete; on long
+      # number tokens that is a real per-value allocation. Underscores are rare, so only
+      # pay it when the token actually contains one (measured +27% on long-token decimals).
+      body = str.include?("_") ? str.delete("_") : str
       body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
     end
-    # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
+    # True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match.
+    def hex_prefix?(str)
+      c0 = str.getbyte(0)
+      if c0 == ZERO
+        x = str.getbyte(1)
+        x == LOWER_X || x == UPPER_X
+      elsif c0 == MINUS || c0 == PLUS
+        str.getbyte(1) == ZERO && ((x = str.getbyte(2)) == LOWER_X || x == UPPER_X)
+      else
+        false
+      end
+    end
+    # A decimal (has '.' or exponent). decimal_precision: :float -> Float,
     # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
     # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
     def decimal_value(body)
-      case @bigdecimal_load
-      when :float      then body.to_f
+      case @decimal_precision
+      when :float      then float_or_warn(body)
       when :bigdecimal then to_big_decimal(body)
-      else                  significant_digits(body) > 16 ? to_big_decimal(body) : body.to_f
+      else                  significant_digits(body) > 16 ? to_big_decimal(body) : float_or_warn(body)
       end
     end
+    # A finite numeric literal whose magnitude exceeds Float range (e.g. 1e400) becomes
+    # ±Infinity — a silent data change. Report it via :number_overflow (the value is still
+    # returned; we warn rather than raise or invent). The Infinity/NaN *keywords* go through
+    # a separate path and never reach here, so they don't warn.
+    def float_or_warn(body)
+      f = body.to_f
+      # Only test for overflow when an on_warning handler is listening: `f.infinite?` is a
+      # per-float method call we don't want on the hot number path otherwise, and with no
+      # handler the warning would go nowhere anyway. Overflow is vanishingly rare.
+      warn(:number_overflow, "number literal out of Float range — collapsed to #{f}") if @on_warning && f.infinite?
+      f
+    end
+    # Count significant mantissa digits (leading zeros excluded, exponent ignored) to pick
+    # Float vs BigDecimal in :auto mode. A single byte-scan — the old three-regex version
+    # (strip exponent, strip non-digits, strip leading zeros, .length) ran on every float
+    # and dominated the number path's cost. body is a DEC_RE-validated token (digits, at most
+    # one '.', optional sign, optional e/E exponent), underscores already removed.
     def significant_digits(body)
-      body.sub(/[eE].*\z/, "").gsub(/[^0-9]/, "").sub(/\A0+/, "").length
+      count = 0
+      leading = true
+      i = 0
+      n = body.bytesize
+      while i < n
+        b = body.getbyte(i)
+        i += 1
+        break if b == LOWER_E || b == UPPER_E # exponent: its digits aren't significant
+        next unless b >= ZERO && b <= NINE    # skip sign and the decimal point
+        if leading && b == ZERO
+          next                                # leading zero (incl. those after '.') — not significant
+        else
+          leading = false
+          count += 1
+        end
+      end
+      count
     end
     def to_big_decimal(body)
@@ -1175,7 +1445,11 @@ module SmarterJSON
       body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
       BigDecimal(body)
     rescue ArgumentError
+      # Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
+      # so this fallback is unreachable from valid input. Kept as a safety net.
+      # :nocov:
       body.to_f
+      # :nocov:
     end
     # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
@@ -1194,7 +1468,7 @@ module SmarterJSON
     end
     def parse_triple_quoted
-      indent = @col - 1
+      indent = column_at(@pos) - 1
       advance(3)
       raw_start = @pos
       until eof?
@@ -1204,7 +1478,7 @@ module SmarterJSON
       end
       raise error("unterminated triple-quoted string") if eof?
-      raw = @input.byteslice(raw_start, @pos - raw_start).force_encoding(@input.encoding)
+      raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
       advance(3)
       strip_triple(raw, indent)
     end
@@ -1234,20 +1508,30 @@ module SmarterJSON
     def parse_string(quote)
       advance(1)
       start = @pos
-      has_escape = false
+      # Fast path (the common case — a string with no escapes): jump straight to the
+      # closing quote with byteindex. It is called only here, from `start`, which is
+      # always a character boundary, so byteindex never sees a mid-char offset.
+      hit = scan_string_delimiter(quote)
+      raise error("unterminated string") if hit.nil?
+      if @input.getbyte(hit) == quote
+        @pos = hit
+        result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
+        advance(1)
+        return result
+      end
+      # Escape path: a backslash precedes the closing quote. Scan byte by byte from
+      # here — byteindex can't be used past a backslash (a lenient \<multibyte> would
+      # leave @pos mid-character), and this lets the decoder flag invalid escapes
+      # exactly as before. decode_string_with_escapes handles the whole [start, finish].
+      @pos = hit
       while (b = byte)
         if b == quote
-          if has_escape
-            decoded = decode_string_with_escapes(start, @pos, quote)
-            advance(1)
-            return decoded
-          else
-            result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
-            advance(1)
-            return result
-          end
+          decoded = decode_string_with_escapes(start, @pos, quote)
+          advance(1)
+          return decoded
         elsif b == BACKSLASH
-          has_escape = true
           advance(1)
           raise error("unterminated string escape") if eof?
@@ -1259,6 +1543,20 @@ module SmarterJSON
       raise error("unterminated string")
     end
+    # Byte index of the next closing quote or backslash at/after @pos, or nil if
+    # neither occurs before EOF. byteindex scans inside MRI's C; the fallback is a
+    # tight getbyte loop (the ASCII delimiters never alias UTF-8 continuation bytes,
+    # so byte scanning is correct for UTF-8 string content).
+    def scan_string_delimiter(quote)
+      if BYTEINDEX_AVAILABLE
+        @input.byteindex(quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH, @pos)
+      else
+        i = @pos
+        i += 1 while i < @bytesize && (b = @input.getbyte(i)) != quote && b != BACKSLASH
+        i < @bytesize ? i : nil
+      end
+    end
     def decode_string_with_escapes(start, finish, _quote)
       buf = String.new(encoding: Encoding::ASCII_8BIT)
       i = start
@@ -1350,7 +1648,7 @@ module SmarterJSON
       if byte == ZERO
         advance(1)
-        if [LOWER_X, UPPER_X].include?(byte)
+        if (x = byte) == LOWER_X || x == UPPER_X
           advance(1)
           hex_start = @pos
           advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
@@ -1375,10 +1673,10 @@ module SmarterJSON
         advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
       end
-      if [LOWER_E, UPPER_E].include?(byte)
+      if (e = byte) == LOWER_E || e == UPPER_E
         is_float = true
         advance(1)
-        advance(1) if [PLUS, MINUS].include?(byte)
+        advance(1) if (s = byte) == PLUS || s == MINUS
         raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
         advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
@@ -1414,11 +1712,13 @@ module SmarterJSON
     def warn(type, message)
       return unless @on_warning
-      @on_warning.call(Warning.new(type, message, @line, @col))
+      line, col = line_col_at(@pos)
+      @on_warning.call(Warning.new(type, message, line, col))
     end
     def error(message)
-      ParseError.new(message, @line, @col)
+      line, col = line_col_at(@pos)
+      ParseError.new(message, line, col)
     end
     def display_byte(b)