smarter_json 0.9.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/CHANGELOG.md +89 -55
- data/README.md +216 -73
- data/docs/_introduction.md +6 -12
- data/docs/basic_read_api.md +29 -19
- data/docs/basic_write_api.md +3 -3
- data/docs/examples.md +32 -23
- data/docs/options.md +20 -19
- data/ext/smarter_json/smarter_json.c +246 -92
- data/ext/smarter_json/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_json/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_json/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_json/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_json/backports.rb +28 -0
- data/lib/smarter_json/generator.rb +100 -65
- data/lib/smarter_json/options.rb +65 -0
- data/lib/smarter_json/parser.rb +441 -141
- data/lib/smarter_json/version.rb +1 -1
- data/lib/smarter_json.rb +3 -1
- metadata +21 -11
- data/ext/smarter_json/vendor/ryu.h +0 -819
- data/ext/smarter_json/vendor/ryu.md +0 -22
data/lib/smarter_json/parser.rb
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
# Array#filter_map (used in Recovery#extract_payloads) is Ruby 2.7+; on Ruby < 2.7
|
|
4
|
+
# activate the scoped refinement backport (no-op on 2.7+, which uses native filter_map).
|
|
5
|
+
using SmarterJSON::Backports if Gem::Version.new(RUBY_VERSION) < Gem::Version.new("2.7")
|
|
6
|
+
|
|
3
7
|
module SmarterJSON
|
|
4
8
|
# ParseError / EncodingError live in errors.rb (loaded first) so they can inherit
|
|
5
9
|
# from the shared SmarterJSON::Error base.
|
|
@@ -12,15 +16,20 @@ module SmarterJSON
|
|
|
12
16
|
# is always content, never a filename — use process_file for paths.) The values
|
|
13
17
|
# in `options` override Parser::DEFAULT_OPTIONS.
|
|
14
18
|
#
|
|
15
|
-
# Without a block: returns
|
|
16
|
-
#
|
|
17
|
-
#
|
|
18
|
-
#
|
|
19
|
+
# Without a block: always returns an Array of the documents found — [] for none,
|
|
20
|
+
# [doc] for one, [d1, d2, …] for several (NDJSON / JSONL / concatenated). A
|
|
21
|
+
# top-level value must be a recognized JSON value (number / literal / quoted
|
|
22
|
+
# string / object / array) or an implicit-root object, else it raises. For the
|
|
23
|
+
# single-document case use SmarterJSON.process_one (returns the bare value).
|
|
24
|
+
# :acceleration (default true) selects the C extension when compiled and loaded
|
|
25
|
+
# (SmarterJSON::HAS_ACCELERATION); otherwise the pure-Ruby parser.
|
|
19
26
|
#
|
|
20
|
-
# With a block: yields each top-level document as it is parsed, and returns
|
|
21
|
-
# For an IO this streams document-by-document in bounded memory —
|
|
22
|
-
# stream as newline-delimited documents (NDJSON / JSONL), one per
|
|
27
|
+
# With a block: yields each top-level document as it is parsed, and returns the
|
|
28
|
+
# document count. For an IO this streams document-by-document in bounded memory —
|
|
29
|
+
# it reads the stream as newline-delimited documents (NDJSON / JSONL), one per
|
|
30
|
+
# line.
|
|
23
31
|
def process(input, options = {}, &block)
|
|
32
|
+
options = Options.process_options(options)
|
|
24
33
|
if input.is_a?(String)
|
|
25
34
|
Recovery.process_string(input, options, &block)
|
|
26
35
|
elsif input.respond_to?(:read)
|
|
@@ -39,7 +48,8 @@ module SmarterJSON
|
|
|
39
48
|
# loading the whole file); the documents are read as newline-delimited
|
|
40
49
|
# (NDJSON / JSONL), one per line.
|
|
41
50
|
def process_file(path, options = {}, &block)
|
|
42
|
-
|
|
51
|
+
options = Options.process_options(options)
|
|
52
|
+
encoding = options[:encoding] || "UTF-8"
|
|
43
53
|
if block
|
|
44
54
|
File.open(path, "r:#{encoding}") { |io| stream_io(io, options, &block) }
|
|
45
55
|
else
|
|
@@ -47,8 +57,44 @@ module SmarterJSON
|
|
|
47
57
|
end
|
|
48
58
|
end
|
|
49
59
|
|
|
50
|
-
#
|
|
51
|
-
#
|
|
60
|
+
# SmarterJSON.process_one(input, options = {}) — the single-document accessor.
|
|
61
|
+
#
|
|
62
|
+
# Returns the first document's value (or nil when the input holds no documents).
|
|
63
|
+
# When the input holds MORE than one document it returns the first and warns once
|
|
64
|
+
# — it never raises, since an extra document is valid data; the warning goes to
|
|
65
|
+
# on_warning if set, else Rails.logger.warn when Rails is loaded, else Kernel#warn.
|
|
66
|
+
# For an IO this is bounded memory: it parses just the first document and stops as
|
|
67
|
+
# soon as a second is seen, instead of materialising the whole stream the way
|
|
68
|
+
# process(io).first would. (process(input).first and process(input)[0] silently
|
|
69
|
+
# drop documents 2+ — a footgun; use process_one instead.)
|
|
70
|
+
def process_one(input, options = {})
|
|
71
|
+
options = Options.process_options(options)
|
|
72
|
+
|
|
73
|
+
# IO: bounded memory — parse just the first document and stop once a second is
|
|
74
|
+
# seen (peek-to-warn). A String is already in memory, so use the plain no-block
|
|
75
|
+
# path: it returns the full (wrapper-recovered, de-duplicated) Array in one pass,
|
|
76
|
+
# which also avoids the reactive-recovery double-yield the block path would hit.
|
|
77
|
+
unless input.respond_to?(:read)
|
|
78
|
+
docs = process(input, options)
|
|
79
|
+
warn_extra_documents(options) if docs.length > 1
|
|
80
|
+
return docs.first
|
|
81
|
+
end
|
|
82
|
+
|
|
83
|
+
first = nil
|
|
84
|
+
count = 0
|
|
85
|
+
catch(:smarter_json_first_document) do
|
|
86
|
+
process(input, options) do |doc|
|
|
87
|
+
count += 1
|
|
88
|
+
first = doc if count == 1
|
|
89
|
+
throw(:smarter_json_first_document) if count > 1
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
warn_extra_documents(options) if count > 1
|
|
93
|
+
first
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
# Parse a String of JSON content (the in-memory path). Returns an Array of the
|
|
97
|
+
# documents found (empty for none); the C extension is used when available.
|
|
52
98
|
def process_content(input, options, &block)
|
|
53
99
|
if block
|
|
54
100
|
if options.fetch(:acceleration, true) && HAS_ACCELERATION
|
|
@@ -63,14 +109,55 @@ module SmarterJSON
|
|
|
63
109
|
end
|
|
64
110
|
end
|
|
65
111
|
|
|
112
|
+
# Smart default for the nil :encoding option. A String tagged ASCII-8BIT (BINARY)
|
|
113
|
+
# is how Net::HTTP and many HTTP libraries hand back a response body even when the
|
|
114
|
+
# bytes are UTF-8. JSON's interchange encoding is UTF-8, so we relabel such input
|
|
115
|
+
# to UTF-8 when its bytes are valid UTF-8 — otherwise string values would come back
|
|
116
|
+
# tagged ASCII-8BIT and compare unequal to UTF-8 literals (a silent footgun). When
|
|
117
|
+
# the bytes are NOT valid UTF-8 we raise EncodingError rather than guess a legacy
|
|
118
|
+
# encoding — pass an explicit :encoding for that. An explicit (non-nil) :encoding,
|
|
119
|
+
# or any non-BINARY tag, is left untouched (the per-path force_encoding / validation
|
|
120
|
+
# handles it). Only relabels — never transcodes.
|
|
121
|
+
def normalize_default_encoding(input, options)
|
|
122
|
+
return input unless options[:encoding].nil?
|
|
123
|
+
return input unless input.encoding == Encoding::ASCII_8BIT
|
|
124
|
+
|
|
125
|
+
utf8 = input.dup.force_encoding(Encoding::UTF_8)
|
|
126
|
+
return utf8 if utf8.valid_encoding?
|
|
127
|
+
|
|
128
|
+
raise EncodingError, "input is tagged ASCII-8BIT and is not valid UTF-8 — pass encoding: to declare its encoding"
|
|
129
|
+
end
|
|
130
|
+
|
|
66
131
|
# Stream documents from an IO incrementally, yielding each recovered top-level
|
|
67
132
|
# document without slurping the whole input into memory first.
|
|
68
133
|
def stream_io(io, options, &block)
|
|
69
|
-
|
|
70
|
-
|
|
134
|
+
count = 0
|
|
135
|
+
Framer.each_document(io) do |doc|
|
|
136
|
+
# Recovery.process_string yields each value and returns how many it yielded;
|
|
137
|
+
# blank / comment-only framed segments yield none, so count tracks actual
|
|
138
|
+
# documents (== values yielded), not raw framed segments.
|
|
139
|
+
count += Recovery.process_string(doc, options, &block)
|
|
140
|
+
end
|
|
141
|
+
count
|
|
71
142
|
end
|
|
72
143
|
|
|
73
|
-
|
|
144
|
+
# process_one's "more than one document" notice — routed to on_warning if the caller
|
|
145
|
+
# gave one, else Rails.logger when Rails is loaded, else Kernel#warn. Never silent,
|
|
146
|
+
# never raised.
|
|
147
|
+
def warn_extra_documents(options)
|
|
148
|
+
message = "SmarterJSON.process_one: input has more than one document — returning the first and " \
|
|
149
|
+
"dropping the rest. Use SmarterJSON.process to get every document."
|
|
150
|
+
handler = options[:on_warning]
|
|
151
|
+
if handler
|
|
152
|
+
handler.call(Warning.new(:extra_documents, message, nil, nil))
|
|
153
|
+
elsif defined?(Rails) && Rails.respond_to?(:logger) && Rails.logger
|
|
154
|
+
Rails.logger.warn(message)
|
|
155
|
+
else
|
|
156
|
+
Kernel.warn(message)
|
|
157
|
+
end
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
private_class_method :process_content, :stream_io, :warn_extra_documents
|
|
74
161
|
|
|
75
162
|
# Named byte values, shared by the Parser FSM and the Framer / Recovery byte
|
|
76
163
|
# scanners so none of them spell out raw hex. Included where needed.
|
|
@@ -119,7 +206,7 @@ module SmarterJSON
|
|
|
119
206
|
|
|
120
207
|
module_function
|
|
121
208
|
|
|
122
|
-
def each_document(io
|
|
209
|
+
def each_document(io)
|
|
123
210
|
buffer = +""
|
|
124
211
|
scan = 0
|
|
125
212
|
doc_start = nil
|
|
@@ -343,6 +430,7 @@ module SmarterJSON
|
|
|
343
430
|
module_function
|
|
344
431
|
|
|
345
432
|
def process_string(input, options, &block)
|
|
433
|
+
input = SmarterJSON.send(:normalize_default_encoding, input, options)
|
|
346
434
|
return SmarterJSON.send(:process_content, input, options, &block) unless input.valid_encoding?
|
|
347
435
|
|
|
348
436
|
# Recovery is REACTIVE: parse first, and only fall back to wrapper extraction when
|
|
@@ -385,15 +473,23 @@ module SmarterJSON
|
|
|
385
473
|
handler = options[:on_warning]
|
|
386
474
|
emit_wrapper_warnings(payloads, handler)
|
|
387
475
|
|
|
388
|
-
|
|
389
|
-
|
|
476
|
+
if block_given?
|
|
477
|
+
count = 0
|
|
478
|
+
payloads.each do |payload|
|
|
479
|
+
SmarterJSON.send(:process_content, payload[:slice], options) do |doc|
|
|
480
|
+
block.call(doc)
|
|
481
|
+
count += 1
|
|
482
|
+
end
|
|
483
|
+
end
|
|
484
|
+
return count
|
|
390
485
|
end
|
|
391
486
|
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
487
|
+
# Each payload's process_content now returns an Array of its documents; flatten
|
|
488
|
+
# so several recovered payloads yield one flat Array<doc> (the always-array
|
|
489
|
+
# contract), not an Array of Arrays.
|
|
490
|
+
payloads.flat_map do |payload|
|
|
491
|
+
SmarterJSON.send(:process_content, payload[:slice], options)
|
|
492
|
+
end
|
|
397
493
|
end
|
|
398
494
|
|
|
399
495
|
def emit_wrapper_warnings(payloads, handler)
|
|
@@ -613,18 +709,22 @@ module SmarterJSON
|
|
|
613
709
|
# followed by a digit ("5.", "5.e3"). Matches iff normalize_for_bigdecimal
|
|
614
710
|
# would change the string — so when it doesn't match, we skip normalization.
|
|
615
711
|
NEEDS_DECIMAL_FIXUP = /\A[+-]?\.|\.(?:[eE]|\z)/.freeze
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
#
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
712
|
+
|
|
713
|
+
# parse_string scans to the next closing-quote-or-backslash. byteindex (Ruby 3.2+,
|
|
714
|
+
# MRI) does that jump at C speed; the getbyte loop in scan_string_delimiter is the
|
|
715
|
+
# portable fallback (JRuby / TruffleRuby / older MRI). Both find the same byte.
|
|
716
|
+
BYTEINDEX_AVAILABLE = "".respond_to?(:byteindex)
|
|
717
|
+
DQUOTE_OR_BACKSLASH = /["\\]/.freeze
|
|
718
|
+
SQUOTE_OR_BACKSLASH = /['\\]/.freeze
|
|
719
|
+
|
|
720
|
+
# scan_quoteless_run's fast path jumps (in C) to the first structural terminator
|
|
721
|
+
# (',' '}' ']' '{' '[') OR any whitespace ([[:space:]] covers ASCII + Unicode space,
|
|
722
|
+
# incl. LF/CR which also terminate). Stopping at a terminator/EOF means the run had no
|
|
723
|
+
# interior whitespace, so there's nothing to trim and no comment marker can apply.
|
|
724
|
+
QL_BREAK = /[,{}\[\]]|[[:space:]]/.freeze
|
|
725
|
+
|
|
726
|
+
# The defaults live centrally in SmarterJSON::Options (lib/smarter_json/options.rb).
|
|
727
|
+
DEFAULT_OPTIONS = Options::DEFAULT_OPTIONS
|
|
628
728
|
|
|
629
729
|
def initialize(input, options = {})
|
|
630
730
|
raise ArgumentError, "input must be a String" unless input.is_a?(String)
|
|
@@ -632,8 +732,13 @@ module SmarterJSON
|
|
|
632
732
|
opts = DEFAULT_OPTIONS.merge(options)
|
|
633
733
|
@symbolize_keys = opts[:symbolize_keys]
|
|
634
734
|
@duplicate_key = opts[:duplicate_key]
|
|
635
|
-
@
|
|
636
|
-
@on_warning
|
|
735
|
+
@decimal_precision = opts[:decimal_precision]
|
|
736
|
+
@on_warning = opts[:on_warning]
|
|
737
|
+
# store_member only needs the (per-member) Hash#key? duplicate lookup when a
|
|
738
|
+
# repeat would change behavior: a warning must fire, or :first_wins must keep the
|
|
739
|
+
# first. With the default (:last_wins, no handler) a duplicate just overwrites,
|
|
740
|
+
# which `hash[k] = value` already does — so skip the lookup entirely.
|
|
741
|
+
@check_duplicates = !@on_warning.nil? || @duplicate_key == :first_wins
|
|
637
742
|
|
|
638
743
|
encoding = opts[:encoding]
|
|
639
744
|
@input = encoding ? input.dup.force_encoding(encoding) : input
|
|
@@ -642,8 +747,6 @@ module SmarterJSON
|
|
|
642
747
|
@bytesize = @input.bytesize
|
|
643
748
|
# Skip a UTF-8 BOM (EF BB BF) at the start of input.
|
|
644
749
|
@pos = @input.getbyte(0) == 0xEF && @input.getbyte(1) == 0xBB && @input.getbyte(2) == 0xBF ? 3 : 0
|
|
645
|
-
@line = 1
|
|
646
|
-
@col = 1
|
|
647
750
|
end
|
|
648
751
|
|
|
649
752
|
# No block: auto-detect the document count for free (the same "is there
|
|
@@ -653,17 +756,14 @@ module SmarterJSON
|
|
|
653
756
|
# value. Commas do NOT separate documents (only whitespace / newline /
|
|
654
757
|
# concatenation do), so a bracketless comma list still raises in parse_document.
|
|
655
758
|
def parse
|
|
656
|
-
|
|
657
|
-
return nil if eof?
|
|
658
|
-
|
|
659
|
-
value = parse_document
|
|
660
|
-
skip_whitespace_and_comments
|
|
661
|
-
return value if eof?
|
|
662
|
-
|
|
663
|
-
results = [value]
|
|
759
|
+
results = []
|
|
664
760
|
until eof?
|
|
665
|
-
|
|
666
|
-
|
|
761
|
+
skip_document_separators
|
|
762
|
+
break if eof?
|
|
763
|
+
|
|
764
|
+
value = parse_document
|
|
765
|
+
enforce_scalar_boundary(value)
|
|
766
|
+
results << value
|
|
667
767
|
end
|
|
668
768
|
results
|
|
669
769
|
end
|
|
@@ -671,13 +771,17 @@ module SmarterJSON
|
|
|
671
771
|
# Yield each top-level value until EOF (JSONL / NDJSON / concatenated /
|
|
672
772
|
# whitespace-separated). Used by the block form of SmarterJSON.process.
|
|
673
773
|
def each_value
|
|
674
|
-
|
|
675
|
-
|
|
774
|
+
count = 0
|
|
775
|
+
until eof?
|
|
776
|
+
skip_document_separators
|
|
676
777
|
break if eof?
|
|
677
778
|
|
|
678
|
-
|
|
779
|
+
value = parse_document
|
|
780
|
+
enforce_scalar_boundary(value)
|
|
781
|
+
yield value
|
|
782
|
+
count += 1
|
|
679
783
|
end
|
|
680
|
-
|
|
784
|
+
count
|
|
681
785
|
end
|
|
682
786
|
|
|
683
787
|
private
|
|
@@ -688,6 +792,48 @@ module SmarterJSON
|
|
|
688
792
|
parse_iter(implicit_root_object_ahead?)
|
|
689
793
|
end
|
|
690
794
|
|
|
795
|
+
# Between top-level documents, whitespace, comments, AND commas all separate
|
|
796
|
+
# (commas collapse like the in-container lenient-comma rule). A space alone never
|
|
797
|
+
# separates — that is handled inside the document by the quoteless run, so
|
|
798
|
+
# `1 2 3` is one document (the string "1 2 3") while `1, 2, 3` is three.
|
|
799
|
+
def skip_document_separators
|
|
800
|
+
skip_whitespace_and_comments
|
|
801
|
+
while byte == COMMA
|
|
802
|
+
advance(1)
|
|
803
|
+
skip_whitespace_and_comments
|
|
804
|
+
end
|
|
805
|
+
end
|
|
806
|
+
|
|
807
|
+
# After a top-level value: a self-delimiting value (object / array / quoted string)
|
|
808
|
+
# may be followed by anything (the next document self-delimits), but a bare scalar
|
|
809
|
+
# (number / keyword) must be followed by a real separator — a newline, ',', a
|
|
810
|
+
# comment, or EOF. A space is NOT a separator, so `1 2 3` and `42 "x" true` raise
|
|
811
|
+
# rather than silently splitting; bare top-level words raise in parse_value itself.
|
|
812
|
+
def enforce_scalar_boundary(value)
|
|
813
|
+
return if value.is_a?(String) || value.is_a?(Hash) || value.is_a?(Array)
|
|
814
|
+
|
|
815
|
+
skip_horizontal_whitespace
|
|
816
|
+
b = byte
|
|
817
|
+
return if b.nil? || b == LF || b == CR || b == COMMA
|
|
818
|
+
return if b == HASH || (b == SLASH && ((c = byte_at(1)) == SLASH || c == STAR))
|
|
819
|
+
|
|
820
|
+
raise error("a top-level number or keyword must be followed by a newline, ',', or end of input")
|
|
821
|
+
end
|
|
822
|
+
|
|
823
|
+
# Skip horizontal whitespace only (space / tab / VT / FF) — NOT newlines, which are
|
|
824
|
+
# document separators. Used by the scalar-boundary check above.
|
|
825
|
+
def skip_horizontal_whitespace
|
|
826
|
+
while (b = byte)
|
|
827
|
+
if b == SPACE || b == TAB || b == 0x0B || b == 0x0C
|
|
828
|
+
advance(1)
|
|
829
|
+
elsif b >= 0x80 && (n = multibyte_ws_len(@pos)).positive?
|
|
830
|
+
@pos += n # multibyte horizontal whitespace (NBSP, U+2000–200A, …)
|
|
831
|
+
else
|
|
832
|
+
break
|
|
833
|
+
end
|
|
834
|
+
end
|
|
835
|
+
end
|
|
836
|
+
|
|
691
837
|
# Iterative container parser — explicit stack, NO Ruby recursion, so nesting
|
|
692
838
|
# is bounded only by memory (like Oj and the C extension's fj_parse_iter),
|
|
693
839
|
# never by the call stack. Mirrors the C driver to keep the two paths in
|
|
@@ -708,9 +854,10 @@ module SmarterJSON
|
|
|
708
854
|
end
|
|
709
855
|
|
|
710
856
|
vss = false # warnings: has a value landed in the current container since the last separator?
|
|
711
|
-
|
|
857
|
+
input = @input # hoisted: @input never changes mid-parse; byte reads inline as input.getbyte(@pos)
|
|
858
|
+
while true
|
|
712
859
|
skip_whitespace_and_comments
|
|
713
|
-
b =
|
|
860
|
+
b = input.getbyte(@pos)
|
|
714
861
|
if at_top
|
|
715
862
|
if b == LBRACE
|
|
716
863
|
advance(1)
|
|
@@ -729,8 +876,17 @@ module SmarterJSON
|
|
|
729
876
|
at_top = false
|
|
730
877
|
vss = false
|
|
731
878
|
elsif b.nil?
|
|
879
|
+
# Defensive guard: parse / each_value check eof? before calling parse_iter,
|
|
880
|
+
# so `at_top` never meets end-of-input here. Kept to mirror the C driver.
|
|
881
|
+
# :nocov:
|
|
732
882
|
raise error("unexpected end of input")
|
|
883
|
+
# :nocov:
|
|
733
884
|
else
|
|
885
|
+
# Top-level scalar: must be a recognized JSON value (number / literal /
|
|
886
|
+
# quoted string). A bare word raises — there are no top-level quoteless
|
|
887
|
+
# strings (Decision 2 = B-broad). In-container quoteless still uses
|
|
888
|
+
# parse_member_value; the scalar-vs-separator boundary is enforced by the
|
|
889
|
+
# parse / each_value loop via enforce_scalar_boundary.
|
|
734
890
|
return parse_value
|
|
735
891
|
end
|
|
736
892
|
elsif b == COMMA
|
|
@@ -758,12 +914,12 @@ module SmarterJSON
|
|
|
758
914
|
else
|
|
759
915
|
key = parse_object_key
|
|
760
916
|
skip_whitespace_and_comments
|
|
761
|
-
raise error("expected ':' after key #{key.inspect}") unless
|
|
917
|
+
raise error("expected ':' after key #{key.inspect}") unless input.getbyte(@pos) == COLON
|
|
762
918
|
|
|
763
919
|
advance(1)
|
|
764
920
|
skip_whitespace_and_comments
|
|
765
|
-
b =
|
|
766
|
-
if
|
|
921
|
+
b = input.getbyte(@pos)
|
|
922
|
+
if b == LBRACE || b == LBRACKET
|
|
767
923
|
child = b == LBRACE ? {} : []
|
|
768
924
|
advance(1) # consume { or [
|
|
769
925
|
store_member(cur, key, child)
|
|
@@ -771,7 +927,7 @@ module SmarterJSON
|
|
|
771
927
|
cur = child
|
|
772
928
|
cur_obj = (b == LBRACE)
|
|
773
929
|
vss = false
|
|
774
|
-
elsif
|
|
930
|
+
elsif b == RBRACE || b == COMMA
|
|
775
931
|
# key with a colon but no value -> null (don't consume } or ,; the loop does)
|
|
776
932
|
store_member(cur, key, nil)
|
|
777
933
|
warn(:empty_value, "key #{key.inspect} had no value — used null") if @on_warning
|
|
@@ -796,7 +952,7 @@ module SmarterJSON
|
|
|
796
952
|
raise error("unterminated array")
|
|
797
953
|
elsif b == RBRACE
|
|
798
954
|
raise error("unexpected '}' — expected ']' or a value")
|
|
799
|
-
elsif
|
|
955
|
+
elsif b == LBRACE || b == LBRACKET
|
|
800
956
|
child = b == LBRACE ? {} : []
|
|
801
957
|
advance(1) # consume { or [
|
|
802
958
|
cur.push(child)
|
|
@@ -818,11 +974,11 @@ module SmarterJSON
|
|
|
818
974
|
b = byte
|
|
819
975
|
return false unless b && key_start_byte?(b)
|
|
820
976
|
|
|
821
|
-
saved =
|
|
977
|
+
saved = @pos
|
|
822
978
|
advance(1) while (c = byte) && key_continue_byte?(c)
|
|
823
979
|
skip_pure_whitespace
|
|
824
980
|
result = (byte == COLON)
|
|
825
|
-
@pos
|
|
981
|
+
@pos = saved
|
|
826
982
|
result
|
|
827
983
|
end
|
|
828
984
|
|
|
@@ -840,46 +996,72 @@ module SmarterJSON
|
|
|
840
996
|
@pos >= @bytesize
|
|
841
997
|
end
|
|
842
998
|
|
|
999
|
+
# Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
|
|
1000
|
+
# is computed lazily in line_col_at only when an error/warning is built. This is
|
|
1001
|
+
# the hot-path primitive every consumed byte goes through, so it stays O(1) with
|
|
1002
|
+
# no block, no re-read, and no per-byte branching. Mirrors the C fj_advance.
|
|
843
1003
|
def advance(n = 1)
|
|
844
|
-
n
|
|
845
|
-
|
|
846
|
-
|
|
1004
|
+
@pos += n
|
|
1005
|
+
@pos = @bytesize if @pos > @bytesize
|
|
1006
|
+
end
|
|
847
1007
|
|
|
1008
|
+
# Line and 1-based BYTE column at byte position `pos`, computed lazily by scanning
|
|
1009
|
+
# from the start of the buffer — only on the cold path (error / warning / triple-quote
|
|
1010
|
+
# indent), never per byte. CR, LF, and CRLF each count as one newline; the column is
|
|
1011
|
+
# the byte offset within the line. Mirrors the C extension's fj_line_col so both paths
|
|
1012
|
+
# report identical positions.
|
|
1013
|
+
def line_col_at(pos = @pos)
|
|
1014
|
+
limit = pos < @bytesize ? pos : @bytesize
|
|
1015
|
+
line = 1
|
|
1016
|
+
col = 1
|
|
1017
|
+
i = 0
|
|
1018
|
+
while i < limit
|
|
1019
|
+
b = @input.getbyte(i)
|
|
848
1020
|
if b == LF
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
@pos += 1
|
|
1021
|
+
line += 1
|
|
1022
|
+
col = 1
|
|
852
1023
|
elsif b == CR
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
@pos += 1 if @input.getbyte(@pos) == LF
|
|
1024
|
+
line += 1
|
|
1025
|
+
col = 1
|
|
1026
|
+
i += 1 if i + 1 < @bytesize && @input.getbyte(i + 1) == LF
|
|
857
1027
|
else
|
|
858
|
-
|
|
859
|
-
@pos += 1
|
|
1028
|
+
col += 1
|
|
860
1029
|
end
|
|
1030
|
+
i += 1
|
|
861
1031
|
end
|
|
1032
|
+
[line, col]
|
|
1033
|
+
end
|
|
1034
|
+
|
|
1035
|
+
# 1-based byte column at `pos` (bytes since the last line start). Used for
|
|
1036
|
+
# triple-quoted-string indentation stripping. Mirrors the C fj_column.
|
|
1037
|
+
def column_at(pos = @pos)
|
|
1038
|
+
c = 1
|
|
1039
|
+
i = pos - 1
|
|
1040
|
+
while i >= 0 && (b = @input.getbyte(i)) != LF && b != CR
|
|
1041
|
+
c += 1
|
|
1042
|
+
i -= 1
|
|
1043
|
+
end
|
|
1044
|
+
c
|
|
862
1045
|
end
|
|
863
1046
|
|
|
864
1047
|
# --- whitespace (Unicode [[:space:]] / Rails blank?; see smarter_json.md §4.7) ---
|
|
865
1048
|
|
|
866
1049
|
def skip_pure_whitespace
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
1050
|
+
input = @input
|
|
1051
|
+
pos = @pos
|
|
1052
|
+
while (b = input.getbyte(pos))
|
|
871
1053
|
if b == SPACE || (b >= TAB && b <= CR) # 0x20, or 0x09..0x0D
|
|
872
|
-
|
|
1054
|
+
pos += 1
|
|
873
1055
|
elsif b >= 0x80
|
|
874
|
-
n = multibyte_ws_len(
|
|
1056
|
+
n = multibyte_ws_len(pos)
|
|
875
1057
|
break if n.zero?
|
|
876
1058
|
|
|
877
|
-
|
|
878
|
-
@col += 1
|
|
1059
|
+
pos += n
|
|
879
1060
|
else
|
|
880
1061
|
break
|
|
881
1062
|
end
|
|
882
1063
|
end
|
|
1064
|
+
@pos = pos
|
|
883
1065
|
end
|
|
884
1066
|
|
|
885
1067
|
# Number of bytes of the Unicode-whitespace char starting at pos, or 0.
|
|
@@ -913,19 +1095,20 @@ module SmarterJSON
|
|
|
913
1095
|
# A '#', '//', or '/*' starts a comment only when preceded by whitespace
|
|
914
1096
|
# or at the very start of input (the comment-marker rule).
|
|
915
1097
|
def skip_whitespace_and_comments
|
|
916
|
-
|
|
1098
|
+
while true
|
|
917
1099
|
skip_pure_whitespace
|
|
918
1100
|
b = byte
|
|
919
|
-
|
|
1101
|
+
if b == HASH
|
|
1102
|
+
break unless preceded_by_ws_or_start?
|
|
920
1103
|
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
1104
|
+
skip_to_eol
|
|
1105
|
+
elsif b == SLASH
|
|
1106
|
+
c = byte_at(1)
|
|
1107
|
+
break unless (c == SLASH || c == STAR) && preceded_by_ws_or_start?
|
|
924
1108
|
|
|
925
|
-
|
|
926
|
-
skip_block_comment
|
|
1109
|
+
c == STAR ? skip_block_comment : skip_to_eol
|
|
927
1110
|
else
|
|
928
|
-
|
|
1111
|
+
break
|
|
929
1112
|
end
|
|
930
1113
|
end
|
|
931
1114
|
end
|
|
@@ -965,8 +1148,9 @@ module SmarterJSON
|
|
|
965
1148
|
# --- values ---
|
|
966
1149
|
|
|
967
1150
|
# Top-level / strict value: no quoteless fallback.
|
|
1151
|
+
# Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
|
|
1152
|
+
# so @pos is at the value's first byte — no leading skip needed here.
|
|
968
1153
|
def parse_value
|
|
969
|
-
skip_whitespace_and_comments
|
|
970
1154
|
raise error("unexpected end of input") if eof?
|
|
971
1155
|
|
|
972
1156
|
b = byte
|
|
@@ -999,8 +1183,9 @@ module SmarterJSON
|
|
|
999
1183
|
end
|
|
1000
1184
|
|
|
1001
1185
|
# Value in object-value or array-element position: quoteless allowed.
|
|
1186
|
+
# Precondition: callers (parse_iter) have already run skip_whitespace_and_comments,
|
|
1187
|
+
# so @pos is at the value's first byte — no leading skip needed here.
|
|
1002
1188
|
def parse_member_value
|
|
1003
|
-
skip_whitespace_and_comments
|
|
1004
1189
|
raise error("unexpected end of input") if eof?
|
|
1005
1190
|
|
|
1006
1191
|
b = byte
|
|
@@ -1033,7 +1218,7 @@ module SmarterJSON
|
|
|
1033
1218
|
until eof?
|
|
1034
1219
|
if @input.getbyte(@pos) == 0xE2 && @input.getbyte(@pos + 1) == 0x80 &&
|
|
1035
1220
|
closers.include?(@input.getbyte(@pos + 2))
|
|
1036
|
-
result = @input.byteslice(start, @pos - start)
|
|
1221
|
+
result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
1037
1222
|
advance(3)
|
|
1038
1223
|
return result
|
|
1039
1224
|
end
|
|
@@ -1044,9 +1229,7 @@ module SmarterJSON
|
|
|
1044
1229
|
|
|
1045
1230
|
def store_member(hash, key, value)
|
|
1046
1231
|
k = @symbolize_keys ? key.to_sym : key
|
|
1047
|
-
if hash.key?(k)
|
|
1048
|
-
raise error("duplicate key #{k.inspect}") if @duplicate_key == :raise
|
|
1049
|
-
|
|
1232
|
+
if @check_duplicates && hash.key?(k)
|
|
1050
1233
|
warn(:duplicate_key, "duplicate key #{k.inspect} — #{@duplicate_key}") if @on_warning
|
|
1051
1234
|
return if @duplicate_key == :first_wins
|
|
1052
1235
|
end
|
|
@@ -1057,6 +1240,12 @@ module SmarterJSON
|
|
|
1057
1240
|
b = byte
|
|
1058
1241
|
return parse_string(DQUOTE) if b == DQUOTE
|
|
1059
1242
|
return parse_string(SQUOTE) if b == SQUOTE
|
|
1243
|
+
|
|
1244
|
+
# A key may open with a smart/curly quote too (word-processor paste curls keys,
|
|
1245
|
+
# not just values) — route to the same reader values already use.
|
|
1246
|
+
kind = smart_quote_kind(@pos)
|
|
1247
|
+
return parse_smart_string(kind) if kind
|
|
1248
|
+
|
|
1060
1249
|
raise error("expected a key") unless b && key_start_byte?(b)
|
|
1061
1250
|
|
|
1062
1251
|
parse_identifier_key
|
|
@@ -1077,51 +1266,77 @@ module SmarterJSON
|
|
|
1077
1266
|
start = @pos
|
|
1078
1267
|
advance(1)
|
|
1079
1268
|
advance(1) while (b = byte) && key_continue_byte?(b)
|
|
1080
|
-
@input.byteslice(start, @pos - start)
|
|
1269
|
+
@input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
1081
1270
|
end
|
|
1082
1271
|
|
|
1083
1272
|
# --- quoteless strings & literal classification ---
|
|
1084
1273
|
|
|
1085
1274
|
def parse_quoteless_or_literal
|
|
1086
1275
|
start = @pos
|
|
1087
|
-
scan_quoteless_run
|
|
1276
|
+
value_end = scan_quoteless_run
|
|
1088
1277
|
# A quoteless run must consume at least one byte. If the first byte is a
|
|
1089
1278
|
# delimiter (',' '}' ']'), the run is empty and @pos didn't move — returning
|
|
1090
1279
|
# here would make the caller's `result << parse_member_value` loop forever.
|
|
1091
1280
|
# Raise instead (correct today: the Lenient Commas Option is not adopted).
|
|
1092
1281
|
raise error("expected a value") if @pos == start
|
|
1093
1282
|
|
|
1094
|
-
|
|
1095
|
-
|
|
1283
|
+
# value_end is the end of the last non-whitespace char in the run; slicing to it
|
|
1284
|
+
# drops trailing whitespace without a regex (the caller already skipped leading
|
|
1285
|
+
# whitespace, so there is none to trim at the front). Equivalent to the old
|
|
1286
|
+
# trim_blank(raw) but with no per-scalar String#sub allocations.
|
|
1287
|
+
raw = @input.byteslice(start, value_end - start) # byteslice preserves @input's encoding
|
|
1288
|
+
classify_quoteless(raw)
|
|
1096
1289
|
end
|
|
1097
1290
|
|
|
1098
1291
|
# Advance to the end of a quoteless run. Stops at structural punctuation
|
|
1099
|
-
# (',' '}' ']'
|
|
1100
|
-
#
|
|
1292
|
+
# (',' '{' '}' '[' ']' — openers terminate symmetrically with closers, so a
|
|
1293
|
+
# self-delimiting value starts fresh: `localhost {"a":1}` -> ["localhost", {...}]),
|
|
1294
|
+
# a newline, EOF, or a comment marker that is preceded by whitespace. Spaces by
|
|
1295
|
+
# themselves are not delimiters.
|
|
1296
|
+
# Advance @pos to the end of the quoteless run (including any trailing whitespace,
|
|
1297
|
+
# so the parser resumes correctly after the value). Returns value_end: the byte
|
|
1298
|
+
# offset just past the last NON-whitespace char, so the caller can slice off
|
|
1299
|
+
# trailing whitespace without a regex.
|
|
1101
1300
|
def scan_quoteless_run
|
|
1301
|
+
input = @input
|
|
1302
|
+
pos = @pos
|
|
1303
|
+
# Fast path: one C-level byteindex jumps to the first structural terminator or
|
|
1304
|
+
# whitespace. If it lands on a terminator (or EOF) the run had no interior whitespace,
|
|
1305
|
+
# so [pos, hit) is the whole value — value_end == hit (no trailing trim) and no comment
|
|
1306
|
+
# marker can apply (those only break after whitespace). This is the common case
|
|
1307
|
+
# (numbers and simple tokens). Anything with whitespace falls to the byte-by-byte loop.
|
|
1308
|
+
if BYTEINDEX_AVAILABLE
|
|
1309
|
+
hit = input.byteindex(QL_BREAK, pos) || @bytesize
|
|
1310
|
+
b = hit < @bytesize ? input.getbyte(hit) : nil
|
|
1311
|
+
if b.nil? || b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
|
|
1312
|
+
@pos = hit
|
|
1313
|
+
return hit
|
|
1314
|
+
end
|
|
1315
|
+
end
|
|
1316
|
+
|
|
1317
|
+
# Slow path: the run contains whitespace — scan byte by byte to honor interior
|
|
1318
|
+
# whitespace, trailing-whitespace trimming (value_end is the end of the last
|
|
1319
|
+
# non-whitespace char), and the comment-marker-after-whitespace rule.
|
|
1320
|
+
value_end = pos
|
|
1102
1321
|
prev_ws = false
|
|
1103
|
-
|
|
1104
|
-
b
|
|
1105
|
-
break if b.
|
|
1106
|
-
break if [COMMA, RBRACE, RBRACKET, LF, CR].include?(b)
|
|
1107
|
-
break if prev_ws && (b == HASH || (b == SLASH && [SLASH, STAR].include?(byte_at(1))))
|
|
1322
|
+
while (b = input.getbyte(pos))
|
|
1323
|
+
break if b == COMMA || b == RBRACE || b == RBRACKET || b == LBRACE || b == LBRACKET || b == LF || b == CR
|
|
1324
|
+
break if prev_ws && (b == HASH || (b == SLASH && ((c = input.getbyte(pos + 1)) == SLASH || c == STAR)))
|
|
1108
1325
|
|
|
1109
1326
|
if b == SPACE || (b >= TAB && b <= CR) # tab/VT/FF/space (LF/CR already broke)
|
|
1110
1327
|
prev_ws = true
|
|
1111
|
-
|
|
1112
|
-
elsif b >= 0x80 && (n = multibyte_ws_len(
|
|
1328
|
+
pos += 1
|
|
1329
|
+
elsif b >= 0x80 && (n = multibyte_ws_len(pos)).positive?
|
|
1113
1330
|
prev_ws = true
|
|
1114
|
-
|
|
1115
|
-
@col += 1
|
|
1331
|
+
pos += n
|
|
1116
1332
|
else
|
|
1117
1333
|
prev_ws = false
|
|
1118
|
-
|
|
1334
|
+
pos += 1
|
|
1335
|
+
value_end = pos
|
|
1119
1336
|
end
|
|
1120
1337
|
end
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
def trim_blank(str)
|
|
1124
|
-
str.sub(BLANK_HEAD, "").sub(BLANK_TAIL, "")
|
|
1338
|
+
@pos = pos
|
|
1339
|
+
value_end
|
|
1125
1340
|
end
|
|
1126
1341
|
|
|
1127
1342
|
def classify_quoteless(str)
|
|
@@ -1132,7 +1347,7 @@ module SmarterJSON
|
|
|
1132
1347
|
when "undefined" then return nil
|
|
1133
1348
|
when "NaN" then return Float::NAN
|
|
1134
1349
|
when "Infinity", "+Infinity" then return Float::INFINITY
|
|
1135
|
-
when "-Infinity" then return
|
|
1350
|
+
when "-Infinity" then return -Float::INFINITY
|
|
1136
1351
|
end
|
|
1137
1352
|
num = numeric_value(str)
|
|
1138
1353
|
num.equal?(NOT_NUMERIC) ? str : num
|
|
@@ -1140,31 +1355,86 @@ module SmarterJSON
|
|
|
1140
1355
|
|
|
1141
1356
|
# Returns an Integer/Float, or NOT_NUMERIC if the whole token isn't a number.
|
|
1142
1357
|
def numeric_value(str)
  # Hex literal. The byte-level prefix probe runs first so the comparatively
  # expensive HEX_RE match is only attempted on tokens shaped like [+-]?0x… ;
  # nearly every number is decimal and skips the regexp call entirely.
  if hex_prefix?(str) && HEX_RE.match?(str)
    # Drop the optional sign and any underscores, then the leading "0x".
    digits = str.sub(/\A[-+]/, "").delete("_")[2..-1]
    magnitude = digits.to_i(16)
    return str.getbyte(0) == MINUS ? -magnitude : magnitude
  end

  # A decimal token has to satisfy DEC_RE and contain at least one digit.
  return NOT_NUMERIC unless DEC_RE.match?(str) && str.match?(/[0-9]/)

  # String#delete always allocates a copy, so only call it when the token
  # really contains an underscore (underscores are rare in number tokens).
  plain = str.include?("_") ? str.delete("_") : str
  return decimal_value(plain) if plain.match?(/[.eE]/)

  plain.to_i
end
|
|
1154
1375
|
|
|
1155
|
-
#
|
|
1376
|
+
# True when the token starts with [+-]?0[xX] — the only shape HEX_RE can match.
|
|
1377
|
+
def hex_prefix?(str)
  lead = str.getbyte(0)
  # An optional sign pushes the "0x" pair one byte to the right.
  zero_at = (lead == MINUS || lead == PLUS) ? 1 : 0
  return false unless str.getbyte(zero_at) == ZERO

  marker = str.getbyte(zero_at + 1)
  marker == LOWER_X || marker == UPPER_X
end
|
|
1388
|
+
|
|
1389
|
+
# A decimal (has '.' or exponent). decimal_precision: :float -> Float,
|
|
1156
1390
|
# :bigdecimal -> BigDecimal, :auto -> BigDecimal when the mantissa has more
|
|
1157
1391
|
# than 16 significant digits (Oj's DEC_MAX threshold), else Float.
|
|
1158
1392
|
def decimal_value(body)
  # Decide the target type first, then convert once.
  wants_big_decimal =
    case @decimal_precision
    when :float then false
    when :bigdecimal then true
    else significant_digits(body) > 16 # any other setting: pick by mantissa length
    end

  wants_big_decimal ? to_big_decimal(body) : float_or_warn(body)
end
|
|
1165
1399
|
|
|
1400
|
+
# A finite numeric literal whose magnitude exceeds Float range (e.g. 1e400) becomes
|
|
1401
|
+
# ±Infinity — a silent data change. Report it via :number_overflow (the value is still
|
|
1402
|
+
# returned; we warn rather than raise or invent). The Infinity/NaN *keywords* go through
|
|
1403
|
+
# a separate path and never reach here, so they don't warn.
|
|
1404
|
+
def float_or_warn(body)
  value = body.to_f
  # Without an on_warning handler nobody would receive the report, so skip the
  # per-float infinite? check entirely and return straight away.
  return value unless @on_warning

  # The converted value is still returned; the overflow is only reported.
  if value.infinite?
    warn(:number_overflow, "number literal out of Float range — collapsed to #{value}")
  end
  value
end
|
|
1412
|
+
|
|
1413
|
+
# Count significant mantissa digits (leading zeros excluded, exponent ignored) to pick
|
|
1414
|
+
# Float vs BigDecimal in :auto mode. A single byte-scan — the old three-regex version
|
|
1415
|
+
# (strip exponent, strip non-digits, strip leading zeros, .length) ran on every float
|
|
1416
|
+
# and dominated the number path's cost. body is a DEC_RE-validated token (digits, at most
|
|
1417
|
+
# one '.', optional sign, optional e/E exponent), underscores already removed.
|
|
1166
1418
|
def significant_digits(body)
  digits = 0
  body.each_byte do |byte|
    # Everything from the exponent marker onwards is not part of the mantissa.
    break if byte == LOWER_E || byte == UPPER_E
    # Sign and decimal point are not digits.
    next if byte < ZERO || byte > NINE
    # While nothing has been counted yet, zeros are leading zeros (including
    # those after the '.') and are not significant.
    next if digits.zero? && byte == ZERO

    digits += 1
  end
  digits
end
|
|
1169
1439
|
|
|
1170
1440
|
def to_big_decimal(body)
|
|
@@ -1175,7 +1445,11 @@ module SmarterJSON
|
|
|
1175
1445
|
body = normalize_for_bigdecimal(body) if NEEDS_DECIMAL_FIXUP.match?(body)
|
|
1176
1446
|
BigDecimal(body)
|
|
1177
1447
|
rescue ArgumentError
|
|
1448
|
+
# Defensive: BigDecimal() does not reject a DEC_RE-validated, normalized token,
|
|
1449
|
+
# so this fallback is unreachable from valid input. Kept as a safety net.
|
|
1450
|
+
# :nocov:
|
|
1178
1451
|
body.to_f
|
|
1452
|
+
# :nocov:
|
|
1179
1453
|
end
|
|
1180
1454
|
|
|
1181
1455
|
# BigDecimal() rejects a bare leading/trailing dot (".5", "5.", "5.e3").
|
|
@@ -1194,7 +1468,7 @@ module SmarterJSON
|
|
|
1194
1468
|
end
|
|
1195
1469
|
|
|
1196
1470
|
def parse_triple_quoted
|
|
1197
|
-
indent = @
|
|
1471
|
+
indent = column_at(@pos) - 1
|
|
1198
1472
|
advance(3)
|
|
1199
1473
|
raw_start = @pos
|
|
1200
1474
|
until eof?
|
|
@@ -1204,7 +1478,7 @@ module SmarterJSON
|
|
|
1204
1478
|
end
|
|
1205
1479
|
raise error("unterminated triple-quoted string") if eof?
|
|
1206
1480
|
|
|
1207
|
-
raw = @input.byteslice(raw_start, @pos - raw_start)
|
|
1481
|
+
raw = @input.byteslice(raw_start, @pos - raw_start) # byteslice preserves @input's encoding
|
|
1208
1482
|
advance(3)
|
|
1209
1483
|
strip_triple(raw, indent)
|
|
1210
1484
|
end
|
|
@@ -1234,20 +1508,30 @@ module SmarterJSON
|
|
|
1234
1508
|
def parse_string(quote)
|
|
1235
1509
|
advance(1)
|
|
1236
1510
|
start = @pos
|
|
1237
|
-
|
|
1511
|
+
# Fast path (the common case — a string with no escapes): jump straight to the
|
|
1512
|
+
# closing quote with byteindex. It is called only here, from `start`, which is
|
|
1513
|
+
# always a character boundary, so byteindex never sees a mid-char offset.
|
|
1514
|
+
hit = scan_string_delimiter(quote)
|
|
1515
|
+
raise error("unterminated string") if hit.nil?
|
|
1516
|
+
|
|
1517
|
+
if @input.getbyte(hit) == quote
|
|
1518
|
+
@pos = hit
|
|
1519
|
+
result = @input.byteslice(start, @pos - start) # byteslice preserves @input's encoding
|
|
1520
|
+
advance(1)
|
|
1521
|
+
return result
|
|
1522
|
+
end
|
|
1523
|
+
|
|
1524
|
+
# Escape path: a backslash precedes the closing quote. Scan byte by byte from
|
|
1525
|
+
# here — byteindex can't be used past a backslash (a lenient \<multibyte> would
|
|
1526
|
+
# leave @pos mid-character), and this lets the decoder flag invalid escapes
|
|
1527
|
+
# exactly as before. decode_string_with_escapes handles the whole [start, finish].
|
|
1528
|
+
@pos = hit
|
|
1238
1529
|
while (b = byte)
|
|
1239
1530
|
if b == quote
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
return decoded
|
|
1244
|
-
else
|
|
1245
|
-
result = @input.byteslice(start, @pos - start).force_encoding(@input.encoding)
|
|
1246
|
-
advance(1)
|
|
1247
|
-
return result
|
|
1248
|
-
end
|
|
1531
|
+
decoded = decode_string_with_escapes(start, @pos, quote)
|
|
1532
|
+
advance(1)
|
|
1533
|
+
return decoded
|
|
1249
1534
|
elsif b == BACKSLASH
|
|
1250
|
-
has_escape = true
|
|
1251
1535
|
advance(1)
|
|
1252
1536
|
raise error("unterminated string escape") if eof?
|
|
1253
1537
|
|
|
@@ -1259,6 +1543,20 @@ module SmarterJSON
|
|
|
1259
1543
|
raise error("unterminated string")
|
|
1260
1544
|
end
|
|
1261
1545
|
|
|
1546
|
+
# Byte index of the next closing quote or backslash at/after @pos, or nil if
|
|
1547
|
+
# neither occurs before EOF. byteindex scans inside MRI's C; the fallback is a
|
|
1548
|
+
# tight getbyte loop (the ASCII delimiters never alias UTF-8 continuation bytes,
|
|
1549
|
+
# so byte scanning is correct for UTF-8 string content).
|
|
1550
|
+
def scan_string_delimiter(quote)
  if BYTEINDEX_AVAILABLE
    # Pick the pattern matching this string's own quote character.
    pattern = quote == DQUOTE ? DQUOTE_OR_BACKSLASH : SQUOTE_OR_BACKSLASH
    return @input.byteindex(pattern, @pos)
  end

  # Fallback when byteindex is unavailable: walk the bytes one at a time.
  cursor = @pos
  while cursor < @bytesize
    byte = @input.getbyte(cursor)
    return cursor if byte == quote || byte == BACKSLASH

    cursor += 1
  end
  nil # reached EOF without seeing a closing quote or a backslash
end
|
|
1559
|
+
|
|
1262
1560
|
def decode_string_with_escapes(start, finish, _quote)
|
|
1263
1561
|
buf = String.new(encoding: Encoding::ASCII_8BIT)
|
|
1264
1562
|
i = start
|
|
@@ -1350,7 +1648,7 @@ module SmarterJSON
|
|
|
1350
1648
|
|
|
1351
1649
|
if byte == ZERO
|
|
1352
1650
|
advance(1)
|
|
1353
|
-
if
|
|
1651
|
+
if (x = byte) == LOWER_X || x == UPPER_X
|
|
1354
1652
|
advance(1)
|
|
1355
1653
|
hex_start = @pos
|
|
1356
1654
|
advance(1) while (b = byte) && (hex_digit?(b) || b == UNDERSCORE)
|
|
@@ -1375,10 +1673,10 @@ module SmarterJSON
|
|
|
1375
1673
|
advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
|
|
1376
1674
|
end
|
|
1377
1675
|
|
|
1378
|
-
if
|
|
1676
|
+
if (e = byte) == LOWER_E || e == UPPER_E
|
|
1379
1677
|
is_float = true
|
|
1380
1678
|
advance(1)
|
|
1381
|
-
advance(1) if
|
|
1679
|
+
advance(1) if (s = byte) == PLUS || s == MINUS
|
|
1382
1680
|
raise error("invalid number: expected digits in exponent") unless byte && byte >= ZERO && byte <= NINE
|
|
1383
1681
|
|
|
1384
1682
|
advance(1) while (b = byte) && ((b >= ZERO && b <= NINE) || b == UNDERSCORE)
|
|
@@ -1414,11 +1712,13 @@ module SmarterJSON
|
|
|
1414
1712
|
def warn(type, message)
  handler = @on_warning
  return unless handler # no listener registered: nothing to report to

  # Attach the line/column of the current parse position to the warning.
  row, column = line_col_at(@pos)
  handler.call(Warning.new(type, message, row, column))
end
|
|
1419
1718
|
|
|
1420
1719
|
def error(message)
  # Builds (does not raise) a ParseError located at the current parse position;
  # the visible call sites pass the result to `raise`.
  row, column = line_col_at(@pos)
  ParseError.new(message, row, column)
end
|
|
1423
1723
|
|
|
1424
1724
|
def display_byte(b)
|