RubyGems - smarter_json - Versions diffs - 0.6.0 - Mend

smarter_json 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

checksums.yaml +7 -0
data/.gitignore +46 -0
data/CHANGELOG.md +83 -0
data/LICENSE.txt +21 -0
data/README.md +115 -0
data/Rakefile +22 -0
data/docs/_introduction.md +48 -0
data/docs/basic_read_api.md +85 -0
data/docs/basic_write_api.md +144 -0
data/docs/examples.md +140 -0
data/docs/options.md +84 -0
data/ext/smarter_json/extconf.rb +30 -0
data/ext/smarter_json/smarter_json.c +1470 -0
data/ext/smarter_json/smarter_json.h +9 -0
data/ext/smarter_json/vendor/ryu.h +819 -0
data/ext/smarter_json/vendor/ryu.md +22 -0
data/lib/smarter_json/errors.rb +28 -0
data/lib/smarter_json/generator.rb +209 -0
data/lib/smarter_json/parser.rb +957 -0
data/lib/smarter_json/version.rb +5 -0
data/lib/smarter_json/warning.rb +17 -0
data/lib/smarter_json.rb +25 -0
metadata +87 -0

data/lib/smarter_json/parser.rb ADDED Viewed

@@ -0,0 +1,957 @@
+# frozen_string_literal: true
+module SmarterJSON
+  # ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
+  # from the shared SmarterJSON::Error base.
+  module_function
+  # SmarterJSON.process(input, options = {}) — the main entry point.
+  #
+  # `input` is either a String of JSON content or an IO to read from. (A String
+  # is always content, never a filename — use process_file for paths.) The values
+  # in `options` override Parser::DEFAULT_OPTIONS.
+  #
+  # Without a block: returns nil (zero documents), the value (one document), or an
+  # Array of the values (two or more — NDJSON / JSONL / concatenated / whitespace-
+  # separated). :acceleration (default true) selects the C extension when compiled
+  # and loaded (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
+  #
+  # With a block: yields each top-level document as it is parsed, and returns nil.
+  # For an IO this streams document-by-document in bounded memory — it reads the
+  # stream as newline-delimited documents (NDJSON / JSONL), one per line.
+  def process(input, options = {}, &block)
+    if input.is_a?(String)
+      process_content(input, options, &block)
+    elsif input.respond_to?(:read)
+      block ? stream_io(input, options, &block) : process_content(input.read, options)
+    else
+      raise ArgumentError, "SmarterJSON.process expects a String or an IO, got #{input.class}"
+    end
+  end
+  # SmarterJSON.process_file(path, options = {}) — open a file and process it.
+  #
+  # The :encoding option labels the file's encoding (default "UTF-8"); it does NOT
+  # trigger a transcoding pass — the parser works on the bytes in their native
+  # encoding and emits string values with the same encoding tag. With a block,
+  # streams document-by-document straight from disk in bounded memory (never
+  # loading the whole file); the documents are read as newline-delimited
+  # (NDJSON / JSONL), one per line.
+  def process_file(path, options = {}, &block)
+    encoding = options.fetch(:encoding, "UTF-8")
+    if block
+      File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
+    else
+      process_content(File.read(path, encoding: encoding), options)
+    end
+  end
+  # Parse a String of JSON content (the in-memory path). Returns nil (block) or
+  # the value / Array (no block); the C extension is used when available.
+  def process_content(input, options, &block)
+    if block
+      if options.fetch(:acceleration, true) && HAS_ACCELERATION
+        parse_c(input, options, &block)
+      else
+        Parser.new(input, options).each_value(&block)
+      end
+    elsif options.fetch(:acceleration, true) && HAS_ACCELERATION
+      parse_c(input, options) # returns [result, warnings] when options[:warnings]
+    else
+      parser = Parser.new(input, options)
+      options.fetch(:warnings, false) ? [parser.parse, parser.warnings] : parser.parse
+    end
+  end
+  # Stream documents from an IO, one line (= one document) at a time, yielding
+  # each — bounded memory. Newline-delimited (NDJSON / JSONL); a single document
+  # spanning multiple lines is not supported by the streaming path.
+  def stream_io(io, options, &block)
+    io.each_line("\n") { |line| process_content(line, options, &block) }
+    nil
+  end
+  private_class_method :process_content, :stream_io
+  # Hand-rolled FSM single-pass parser.
+  # Layer 1: strict JSON (RFC 8259).
+  # Layer 2: JSON5 additions — line/block comments, trailing comma,
+  #          unquoted ECMAScript identifier keys, single-quoted strings,
+  #          hex numbers, leading/trailing decimal points, Infinity/NaN,
+  #          explicit + sign, \-line-continuation inside strings.
+  # Layer 3: HJSON-inspired additions — #/comment-marker rule, triple-quoted
+  #          strings, quoteless single-line strings, implicit root object,
+  #          newline-as-separator, broader unquoted keys, recognized-literals-win.
+  # Layer 4: smarter_json additions — UTF-8 BOM skip, smart/curly quotes,
+  #          Python literals (True/False/None) and undefined, underscores in
+  #          numeric literals, and encoding validation (SmarterJSON::EncodingError).
+  class Parser
+    LBRACE     = 0x7B
+    RBRACE     = 0x7D
+    LBRACKET   = 0x5B
+    RBRACKET   = 0x5D
+    COLON      = 0x3A
+    COMMA      = 0x2C
+    DQUOTE     = 0x22
+    SQUOTE     = 0x27
+    BACKSLASH  = 0x5C
+    SLASH      = 0x2F
+    STAR       = 0x2A
+    HASH       = 0x23
+    MINUS      = 0x2D
+    PLUS       = 0x2B
+    DOT        = 0x2E
+    ZERO       = 0x30
+    NINE       = 0x39
+    LOWER_E    = 0x65
+    UPPER_E    = 0x45
+    LOWER_T    = 0x74
+    LOWER_F    = 0x66
+    LOWER_N    = 0x6E
+    LOWER_U    = 0x75
+    LOWER_X    = 0x78
+    UPPER_X    = 0x58
+    UPPER_I    = 0x49
+    UPPER_N    = 0x4E
+    UPPER_T    = 0x54
+    UPPER_F    = 0x46
+    UNDERSCORE = 0x5F
+    DOLLAR     = 0x24
+    SPACE      = 0x20
+    TAB        = 0x09
+    LF         = 0x0A
+    CR         = 0x0D
+    NOT_NUMERIC = Object.new
+    HEX_RE      = /\A[-+]?0[xX][0-9a-fA-F_]+\z/.freeze
+    # Mantissa must carry at least one digit (int part, or a leading-dot fraction), so a
+    # bare exponent like "-e695881" is NOT a number — it falls through to a quoteless
+    # string, matching the C path. Trailing exponent stays optional.
+    DEC_RE      = /\A[-+]?(?:(?:0|[1-9][0-9_]*)(?:\.[0-9_]*)?|\.[0-9_]+)(?:[eE][-+]?[0-9_]+)?\z/.freeze
+    # A decimal BigDecimal() would reject as-is: a leading dot (".5") or a dot not
+    # followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
+    # would change the string — so when it doesn't match, we skip normalization.
+    NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
+    BLANK_HEAD  = /\A[[:space:]]+/.freeze
+    BLANK_TAIL  = /[[:space:]]+\z/.freeze
+    # All caller-facing settings live in one options hash (smarter_csv style).
+    DEFAULT_OPTIONS = {
+      acceleration: true, # use the C extension when available
+      encoding: nil, # label the input's encoding (no transcoding)
+      symbolize_keys: false, # Symbol keys instead of String
+      duplicate_key: :last_wins, # :last_wins | :first_wins | :raise
+      bigdecimal_load: :auto, # :auto | :float | :bigdecimal (Oj-compatible)
+      warnings: false, # collect non-fatal lenient fixes; process returns [result, warnings]
+    }.freeze
+    # Warnings collected during the parse (empty slots, empty values, dropped duplicate
+    # keys). Empty unless the parser was built with warnings: true. Public so the module
+    # functions can read it after parse / each_value.
+    attr_reader :warnings
+    def initialize(input, options = {})
+      raise ArgumentError, "input must be a String" unless input.is_a?(String)
+      opts = DEFAULT_OPTIONS.merge(options)
+      @symbolize_keys  = opts[:symbolize_keys]
+      @duplicate_key   = opts[:duplicate_key]
+      @bigdecimal_load = opts[:bigdecimal_load]
+      @collect_warnings = opts[:warnings]
+      @warnings = []
+      encoding = opts[:encoding]
+      @input = encoding ? input.dup.force_encoding(encoding) : input
+      raise EncodingError, "invalid byte sequence for #{@input.encoding.name}" unless @input.valid_encoding?
+      @bytesize = @input.bytesize
+      # Skip a UTF-8 BOM (EF BB BF) at the start of input.
+      @pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
+      @line = 1
+      @col = 1
+    end
+    # No block: auto-detect the document count for free (the same "is there
+    # trailing content?" check that used to raise). 0 documents -> nil; 1 document
+    # -> the value itself (single-document path, no Array allocated); 2+ documents
+    # (NDJSON / JSONL / concatenated / whitespace-separated) -> an Array of every
+    # value. Commas do NOT separate documents (only whitespace / newline /
+    # concatenation do), so a bracketless comma list still raises in parse_document.
+    def parse
+      skip_whitespace_and_comments
+      return nil if eof?
+      value = parse_document
+      skip_whitespace_and_comments
+      return value if eof?
+      results = [value]
+      until eof?
+        results << parse_document
+        skip_whitespace_and_comments
+      end
+      results
+    end
+    # Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
+    # whitespace-separated). Used by the block form of SmarterJSON.process.
+    def each_value
+      loop do
+        skip_whitespace_and_comments
+        break if eof?
+        yield parse_document
+      end
+      nil
+    end
+    private
+    # --- top-level dispatch ---
+    def parse_document
+      parse_iter(implicit_root_object_ahead?)
+    end
+    # Iterative container parser — explicit stack, NO Ruby recursion, so nesting
+    # is bounded only by memory (like Oj and the C extension's fj_parse_iter),
+    # never by the call stack. Mirrors the C driver to keep the two paths in
+    # parity.
+    def parse_iter(implicit_root)
+      stack = []
+      root = nil
+      cur = nil
+      cur_obj = false
+      at_top = true
+      if implicit_root
+        root = {}
+        stack.push(root)
+        cur = root
+        cur_obj = true
+        at_top = false
+      end
+      vss = false # warnings: has a value landed in the current container since the last separator?
+      loop do
+        skip_whitespace_and_comments
+        b = byte
+        if at_top
+          if b == LBRACE
+            advance(1)
+            root = {}
+            stack.push(root)
+            cur = root
+            cur_obj = true
+            at_top = false
+            vss = false
+          elsif b == LBRACKET
+            advance(1)
+            root = []
+            stack.push(root)
+            cur = root
+            cur_obj = false
+            at_top = false
+            vss = false
+          elsif b.nil?
+            raise error("unexpected end of input")
+          else
+            return parse_value
+          end
+        elsif b == COMMA
+          # Commas are collapsing separators inside a container: an empty slot (leading,
+          # interior, or trailing comma) adds nothing. Skip it; the next iteration reads
+          # the following value/key or the closing bracket.
+          warn(:empty_slot, "extra comma — collapsed an empty slot") unless vss
+          vss = false
+          advance(1)
+        elsif cur_obj
+          if b == RBRACE
+            advance(1)
+            stack.pop
+            return root if stack.empty?
+            cur = stack.last
+            cur_obj = cur.is_a?(Hash)
+            vss = true # the just-closed container is a value in its parent
+          elsif b.nil?
+            return root if implicit_root && stack.size == 1
+            raise error("unterminated object")
+          elsif b == RBRACKET
+            raise error("unexpected ']' — expected a key or '}'")
+          else
+            key = parse_object_key
+            skip_whitespace_and_comments
+            raise error("expected ':' after key #{key.inspect}") unless byte == COLON
+            advance(1)
+            skip_whitespace_and_comments
+            b = byte
+            if [LBRACE, LBRACKET].include?(b)
+              child = b == LBRACE ? {} : []
+              advance(1) # consume { or [
+              store_member(cur, key, child)
+              stack.push(child)
+              cur = child
+              cur_obj = (b == LBRACE)
+              vss = false
+            elsif [RBRACE, COMMA].include?(b)
+              # key with a colon but no value -> null (don't consume } or ,; the loop does)
+              store_member(cur, key, nil)
+              warn(:empty_value, "key #{key.inspect} had no value — used null")
+              vss = true
+            elsif b.nil?
+              raise error("unexpected end of input")
+            else
+              store_member(cur, key, parse_member_value)
+              vss = true
+            end
+          end
+        else # array
+          if b == RBRACKET
+            advance(1)
+            stack.pop
+            return root if stack.empty?
+            cur = stack.last
+            cur_obj = cur.is_a?(Hash)
+            vss = true # the just-closed container is a value in its parent
+          elsif b.nil?
+            raise error("unterminated array")
+          elsif b == RBRACE
+            raise error("unexpected '}' — expected ']' or a value")
+          elsif [LBRACE, LBRACKET].include?(b)
+            child = b == LBRACE ? {} : []
+            advance(1) # consume { or [
+            cur.push(child)
+            stack.push(child)
+            cur = child
+            cur_obj = (b == LBRACE)
+            vss = false
+          else
+            cur.push(parse_member_value)
+            vss = true
+          end
+        end
+      end
+    end
+    # At the start of a document: an unquoted identifier followed by ':' means
+    # an implicit root object (no outer braces). Look ahead without consuming.
+    def implicit_root_object_ahead?
+      b = byte
+      return false unless b && key_start_byte?(b)
+      saved = [@pos, @line, @col]
+      advance(1) while (c = byte) && key_continue_byte?(c)
+      skip_pure_whitespace
+      result = (byte == COLON)
+      @pos, @line, @col = saved
+      result
+    end
+    # --- byte access ---
+    def byte
+      @input.getbyte(@pos)
+    end
+    def byte_at(offset)
+      @input.getbyte(@pos + offset)
+    end
+    def eof?
+      @pos >= @bytesize
+    end
+    def advance(n = 1)
+      n.times do
+        b = @input.getbyte(@pos)
+        return if b.nil?
+        if b == LF
+          @line += 1
+          @col = 1
+          @pos += 1
+        elsif b == CR
+          @line += 1
+          @col = 1
+          @pos += 1
+          @pos += 1 if @input.getbyte(@pos) == LF
+        else
+          @col += 1
+          @pos += 1
+        end
+      end
+    end
+    # --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
+    def skip_pure_whitespace
+      loop do
+        b = byte
+        break if b.nil?
+        if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
+          advance(1)
+        elsif b >= 0x80
+          n = multibyte_ws_len(@pos)
+          break if n.zero?
+          @pos += n
+          @col += 1
+        else
+          break
+        end
+      end
+    end
+    # Number of bytes of the Unicode-whitespace char starting at pos, or 0.
+    # Only meaningful for bytes >= 0x80.
+    def multibyte_ws_len(pos)
+      b0 = @input.getbyte(pos)
+      return 0 if b0 != 0xC2 && (b0 < 0xE1 || b0 > 0xE3) # reject-gate
+      b1 = @input.getbyte(pos + 1)
+      return 0 if b1.nil?
+      return [0xA0, 0x85].include?(b1) ? 2 : 0 if b0 == 0xC2 # NBSP, NEL
+      b2 = @input.getbyte(pos + 2)
+      return 0 if b2.nil?
+      case b0
+      when 0xE1
+        return 3 if b1 == 0x9A && b2 == 0x80                 # U+1680
+      when 0xE2
+        if b1 == 0x80
+          return 3 if (b2 >= 0x80 && b2 <= 0x8A) || b2 == 0xA8 || b2 == 0xA9 || b2 == 0xAF
+        elsif b1 == 0x81 && b2 == 0x9F
+          return 3                                           # U+205F
+        end
+      when 0xE3
+        return 3 if b1 == 0x80 && b2 == 0x80                 # U+3000
+      end
+      0
+    end
+    # A '#', '//', or '/*' starts a comment only when preceded by whitespace
+    # or at the very start of input (the comment-marker rule).
+    def skip_whitespace_and_comments
+      loop do
+        skip_pure_whitespace
+        b = byte
+        break if b.nil?
+        is_marker = (b == HASH) || (b == SLASH && [SLASH, STAR].include?(byte_at(1)))
+        break unless is_marker
+        break unless preceded_by_ws_or_start?
+        if b == SLASH && byte_at(1) == STAR
+          skip_block_comment
+        else
+          skip_to_eol
+        end
+      end
+    end
+    def preceded_by_ws_or_start?
+      return true if @pos.zero?
+      prev = @input.getbyte(@pos - 1)
+      return true if prev == SPACE || (prev >= TAB && prev <= CR)
+      return false if prev < 0x80
+      # rare: a multibyte whitespace char ending right before @pos
+      i = @pos - 1
+      i -= 1 while i.positive? && (@input.getbyte(i) & 0xC0) == 0x80
+      n = multibyte_ws_len(i)
+      n.positive? && (i + n == @pos)
+    end
+    def skip_to_eol
+      advance(1) while (c = byte) && c != LF && c != CR
+    end
+    def skip_block_comment
+      advance(2) # consume /*
+      until eof?
+        break if byte == STAR && byte_at(1) == SLASH
+        advance(1)
+      end
+      raise error("unterminated block comment") if eof?
+      advance(2) # consume */
+    end
+    # Layer 1 (strict JSON) shape: whitespace + at most one comma + whitespace.
+    # The Lenient Commas Option becomes a one-line change here.
+    # --- values ---
+    # Top-level / strict value: no quoteless fallback.
+    def parse_value
+      skip_whitespace_and_comments
+      raise error("unexpected end of input") if eof?
+      b = byte
+      case b
+      when DQUOTE   then parse_string(DQUOTE)
+      when SQUOTE   then parse_single_or_triple
+      when MINUS, PLUS, DOT, ZERO..NINE, UPPER_I then parse_number
+      when UPPER_N then parse_upper_n # NaN vs None
+      when LOWER_T then parse_literal_keyword("true", true)
+      when LOWER_F then parse_literal_keyword("false", false)
+      when LOWER_N then parse_literal_keyword("null", nil)
+      when LOWER_U then parse_literal_keyword("undefined", nil)
+      when UPPER_T then parse_literal_keyword("True", true)
+      when UPPER_F then parse_literal_keyword("False", false)
+      else
+        kind = smart_quote_kind(@pos)
+        return parse_smart_string(kind) if kind
+        raise error("unexpected character #{display_byte(b)}")
+      end
+    end
+    # Disambiguate NaN (number) from None (Python null) at a strict position.
+    def parse_upper_n
+      if byte_at(1) == 0x61 # 'a' → NaN
+        parse_number
+      else
+        parse_literal_keyword("None", nil)
+      end
+    end
+    # Value in object-value or array-element position: quoteless allowed.
+    def parse_member_value
+      skip_whitespace_and_comments
+      raise error("unexpected end of input") if eof?
+      b = byte
+      case b
+      when DQUOTE   then parse_string(DQUOTE)
+      when SQUOTE   then parse_single_or_triple
+      else
+        kind = smart_quote_kind(@pos)
+        kind ? parse_smart_string(kind) : parse_quoteless_or_literal
+      end
+    end
+    # Smart / curly quotes (U+201C/201D double, U+2018/2019 single), UTF-8
+    # E2 80 9C/9D/98/99. Returns :double, :single, or nil.
+    def smart_quote_kind(pos)
+      return nil unless @input.getbyte(pos) == 0xE2 && @input.getbyte(pos + 1) == 0x80
+      case @input.getbyte(pos + 2)
+      when 0x9C, 0x9D then :double
+      when 0x98, 0x99 then :single
+      end
+    end
+    # Content between smart quotes is taken literally (no escape processing).
+    # Accepts either curly variant as opener/closer (lenient about direction).
+    def parse_smart_string(kind)
+      closers = kind == :double ? [0x9C, 0x9D] : [0x98, 0x99]
+      advance(3)
+      start = @pos
+      until eof?
+        if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
+           closers.include?(@input.getbyte(@pos + 2))
+          result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+          advance(3)
+          return result
+        end
+        advance(1)
+      end
+      raise error("unterminated smart-quoted string")
+    end
+    def store_member(hash, key, value)
+      k = @symbolize_keys ? key.to_sym : key
+      if hash.key?(k)
+        raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
+        warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}")
+        return if @duplicate_key == :first_wins
+      end
+      hash[k] = value
+    end
+    def parse_object_key
+      b = byte
+      return parse_string(DQUOTE) if b == DQUOTE
+      return parse_string(SQUOTE) if b == SQUOTE
+      raise error("expected a key") unless b && key_start_byte?(b)
+      parse_identifier_key
+    end
+    def key_start_byte?(b)
+      (b >= 0x41 && b <= 0x5A) ||   # A-Z
+        (b >= 0x61 && b <= 0x7A) || # a-z
+        b == UNDERSCORE ||
+        b == DOLLAR
+    end
+    def key_continue_byte?(b)
+      key_start_byte?(b) || (b >= ZERO && b <= NINE) || b == MINUS # hyphen allowed
+    end
+    def parse_identifier_key
+      start = @pos
+      advance(1)
+      advance(1) while (b = byte) && key_continue_byte?(b)
+      @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+    end
+    # --- quoteless strings & literal classification ---
+    def parse_quoteless_or_literal
+      start = @pos
+      scan_quoteless_run
+      # A quoteless run must consume at least one byte. If the first byte is a
+      # delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
+      # here would make the caller's `result << parse_member_value` loop forever.
+      # Raise instead (correct today: the Lenient Commas Option is not adopted).
+      raise error("expected a value") if @pos == start
+      raw = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+      classify_quoteless(trim_blank(raw))
+    end
+    # Advance to the end of a quoteless run. Stops at structural punctuation
+    # (',' '}' ']'), a newline, EOF, or a comment marker that is preceded by
+    # whitespace. Spaces by themselves are not delimiters.
+    def scan_quoteless_run
+      prev_ws = false
+      loop do
+        b = byte
+        break if b.nil?
+        break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
+        break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
+        if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
+          prev_ws = true
+          advance(1)
+        elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
+          prev_ws = true
+          @pos += n
+          @col += 1
+        else
+          prev_ws = false
+          advance(1)
+        end
+      end
+    end
+    def trim_blank(str)
+      str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
+    end
+    def classify_quoteless(str)
+      case str
+      when "true", "True"          then return true
+      when "false", "False"        then return false
+      when "null", "None"          then return nil
+      when "undefined"             then return nil
+      when "NaN"                   then return Float::NAN
+      when "Infinity", "+Infinity" then return Float::INFINITY
+      when "-Infinity"             then return (-Float::INFINITY)
+      end
+      num = numeric_value(str)
+      num.equal?(NOT_NUMERIC) ? str : num
+    end
+    # Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
+    def numeric_value(str)
+      if HEX_RE.match?(str)
+        neg = str.start_with?("-")
+        body = str.sub(/\A[-+]/, "").delete("_") # "0x...."
+        v = body[2..-1].to_i(16)
+        return neg ? -v : v
+      end
+      return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)
+      body = str.delete("_")
+      body.match?(/[.eE]/) ? decimal_value(body) : body.to_i
+    end
+    # A decimal (has '.' or exponent). bigdecimal_load: :float -> Float,
+    # :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
+    # than 16 significant digits (Oj's DEC_MAX threshold), else Float.
+    def decimal_value(body)
+      case @bigdecimal_load
+      when :float      then body.to_f
+      when :bigdecimal then to_big_decimal(body)
+      else                  significant_digits(body) > 16 ? to_big_decimal(body) : body.to_f
+      end
+    end
+    def significant_digits(body)
+      body.sub(/[eE].*\z/, "").gsub(/[^0-9]/, "").sub(/\A0+/, "").length
+    end
+    def to_big_decimal(body)
+      # Fast path (mirrors the C extension): a clean token goes straight to
+      # BigDecimal(); only a bare/trailing dot needs the normalizing rewrite,
+      # which BigDecimal() would otherwise reject. (body has no underscores here
+      # — numeric_value already stripped them.)
+      body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
+      BigDecimal(body)
+    rescue ArgumentError
+      body.to_f
+    end
+    # BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
+    def normalize_for_bigdecimal(body)
+      body.sub(/\A([+-]?)\./, '\10.').sub(/\.([eE]|\z)/, '.0\1')
+    end
+    # --- quoted strings ---
+    def parse_single_or_triple
+      if byte_at(1) == SQUOTE && byte_at(2) == SQUOTE
+        parse_triple_quoted
+      else
+        parse_string(SQUOTE)
+      end
+    end
+    def parse_triple_quoted
+      indent = @col - 1
+      advance(3)
+      raw_start = @pos
+      until eof?
+        break if byte == SQUOTE && byte_at(1) == SQUOTE && byte_at(2) == SQUOTE
+        advance(1)
+      end
+      raise error("unterminated triple-quoted string") if eof?
+      raw = @input.byteslice(raw_start, @pos - raw_start).force_encoding(@input.encoding)
+      advance(3)
+      strip_triple(raw, indent)
+    end
+    def strip_triple(raw, indent)
+      text = raw.gsub(/\r\n?/, "\n")
+      leading_newline = text.start_with?("\n")
+      lines = text.split("\n", -1)
+      out = []
+      lines.each_with_index do |line, idx|
+        if idx.zero?
+          leading_newline ? next : (out << line)
+        else
+          out << strip_indent(line, indent)
+        end
+      end
+      out.pop if out.last && out.last =~ /\A[ \t]*\z/
+      out.join("\n").force_encoding(@input.encoding)
+    end
+    def strip_indent(line, indent)
+      i = 0
+      i += 1 while i < indent && [" ", "\t"].include?(line[i])
+      line[i..-1] || ""
+    end
+    def parse_string(quote)
+      advance(1)
+      start = @pos
+      has_escape = false
+      while (b = byte)
+        if b == quote
+          if has_escape
+            decoded = decode_string_with_escapes(start, @pos, quote)
+            advance(1)
+            return decoded
+          else
+            result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
+            advance(1)
+            return result
+          end
+        elsif b == BACKSLASH
+          has_escape = true
+          advance(1)
+          raise error("unterminated string escape") if eof?
+          advance(1)
+        else
+          advance(1)
+        end
+      end
+      raise error("unterminated string")
+    end
+    def decode_string_with_escapes(start, finish, _quote)
+      buf = String.new(encoding: Encoding::ASCII_8BIT)
+      i = start
+      while i < finish
+        b = @input.getbyte(i)
+        unless b == BACKSLASH
+          buf << b
+          i += 1
+          next
+        end
+        i += 1
+        esc = @input.getbyte(i)
+        case esc
+        when DQUOTE    then buf << '"'.b
+        when SQUOTE    then buf << "'".b
+        when BACKSLASH then buf << "\\".b
+        when SLASH     then buf << "/".b
+        when 0x62      then buf << "\b".b
+        when 0x66      then buf << "\f".b
+        when 0x6E      then buf << "\n".b
+        when 0x72      then buf << "\r".b
+        when 0x74      then buf << "\t".b
+        when LF
+          # JSON5 line continuation: \<LF> emits nothing
+        when CR
+          i += 1 if @input.getbyte(i + 1) == LF
+        when LOWER_U
+          cp, consumed = decode_unicode_escape(i)
+          buf << [cp].pack("U").b
+          i += consumed
+          next
+        else
+          raise error("invalid escape \\#{esc&.chr || "?"}")
+        end
+        i += 1
+      end
+      buf.force_encoding(@input.encoding)
+    end
+    def decode_unicode_escape(i)
+      raise error("incomplete \\u escape") if i + 4 >= @bytesize
+      hex = @input.byteslice(i + 1, 4)
+      # Match on a binary view: the 4 bytes may split a raw multibyte character, and a
+      # regex on an invalid-UTF-8 String raises ArgumentError. On binary, non-hex bytes
+      # simply fail the match and we raise a clean ParseError below.
+      raise error("invalid \\u escape") unless hex.b.match?(/\A\h{4}\z/)
+      cp = hex.to_i(16)
+      consumed = 5
+      if cp >= 0xD800 && cp <= 0xDBFF
+        unless @input.getbyte(i + consumed) == BACKSLASH && @input.getbyte(i + consumed + 1) == LOWER_U
+          raise error("unpaired high surrogate in string")
+        end
+        hex2 = @input.byteslice(i + consumed + 2, 4)
+        raise error("invalid low surrogate \\u escape") unless hex2 && hex2.bytesize == 4 && hex2.b.match?(/\A\h{4}\z/)
+        cp2 = hex2.to_i(16)
+        raise error("invalid low surrogate value") unless cp2 >= 0xDC00 && cp2 <= 0xDFFF
+        cp = 0x10000 + ((cp - 0xD800) << 10) + (cp2 - 0xDC00)
+        consumed += 6
+      end
+      [cp, consumed]
+    end
+    # --- numbers (top-level / strict positions) ---
+    def parse_number
+      negative = false
+      if byte == MINUS
+        negative = true
+        advance(1)
+      elsif byte == PLUS
+        advance(1)
+      end
+      if byte == UPPER_I
+        consume_keyword!("Infinity")
+        return negative ? -Float::INFINITY : Float::INFINITY
+      end
+      if byte == UPPER_N
+        consume_keyword!("NaN")
+        return Float::NAN
+      end
+      int_start = @pos
+      if byte == ZERO
+        advance(1)
+        if [LOWER_X, UPPER_X].include?(byte)
+          advance(1)
+          hex_start = @pos
+          advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
+          raise error("invalid hex number") if @pos == hex_start
+          value = @input.byteslice(hex_start, @pos - hex_start).delete("_").to_i(16)
+          return negative ? -value : value
+        end
+      elsif byte && byte >= 0x31 && byte <= NINE
+        advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
+      elsif byte == DOT
+        # leading decimal handled below
+      else
+        raise error("invalid number")
+      end
+      is_float = false
+      if byte == DOT
+        is_float = true
+        advance(1)
+        advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
+      end
+      if [LOWER_E, UPPER_E].include?(byte)
+        is_float = true
+        advance(1)
+        advance(1) if [PLUS, MINUS].include?(byte)
+        raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
+        advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
+      end
+      slice = @input.byteslice(int_start, @pos - int_start).delete("_")
+      value = is_float ? decimal_value(slice) : slice.to_i
+      negative ? -value : value
+    end
+    def hex_digit?(b)
+      (b >= ZERO && b <= NINE) ||
+        (b >= 0x41 && b <= 0x46) ||
+        (b >= 0x61 && b <= 0x66)
+    end
+    def consume_keyword!(word)
+      word.bytesize.times do |i|
+        raise error("invalid literal #{word.inspect}") unless byte_at(i) == word.getbyte(i)
+      end
+      advance(word.bytesize)
+    end
+    def parse_literal_keyword(word, value)
+      consume_keyword!(word)
+      value
+    end
+    # Record a non-fatal lenient fix (only when built with warnings: true).
+    def warn(type, message)
+      return unless @collect_warnings
+      @warnings << Warning.new(type, message, @line, @col)
+    end
+    def error(message)
+      ParseError.new(message, @line, @col)
+    end
+    def display_byte(b)
+      return "EOF" if b.nil?
+      if b >= 0x20 && b < 0x7F
+        "'#{b.chr}'"
+      else
+        format("0x%02X", b)
+      end
+    end
+  end
+end