smarter_json 0.9.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/CHANGELOG.md +89 -55
- data/README.md +216 -73
- data/docs/_introduction.md +6 -12
- data/docs/basic_read_api.md +29 -19
- data/docs/basic_write_api.md +3 -3
- data/docs/examples.md +32 -23
- data/docs/options.md +20 -19
- data/ext/smarter_json/smarter_json.c +246 -92
- data/ext/smarter_json/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_json/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_json/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_json/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_json/backports.rb +28 -0
- data/lib/smarter_json/generator.rb +100 -65
- data/lib/smarter_json/options.rb +65 -0
- data/lib/smarter_json/parser.rb +441 -141
- data/lib/smarter_json/version.rb +1 -1
- data/lib/smarter_json.rb +3 -1
- metadata +21 -11
- data/ext/smarter_json/vendor/ryu.h +0 -819
- data/ext/smarter_json/vendor/ryu.md +0 -22
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
#ifdef __ARM_NEON
|
|
5
5
|
#include <arm_neon.h>
|
|
6
6
|
#endif
|
|
7
|
-
#include "vendor/
|
|
7
|
+
#include "vendor/eisel_lemire.h" /* Eisel-Lemire decimal->double, correctly rounded (fast_float) */
|
|
8
8
|
|
|
9
9
|
/* Branch hints / prefetch on the hot scan loops. No-ops on compilers without the
|
|
10
10
|
* builtins (the code is correct either way; these only steer code layout). */
|
|
@@ -12,10 +12,12 @@
|
|
|
12
12
|
# define FJ_LIKELY(x) __builtin_expect(!!(x), 1)
|
|
13
13
|
# define FJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
|
|
14
14
|
# define FJ_PREFETCH(p) __builtin_prefetch(p)
|
|
15
|
+
# define FJ_ALWAYS_INLINE inline __attribute__((always_inline))
|
|
15
16
|
#else
|
|
16
17
|
# define FJ_LIKELY(x) (x)
|
|
17
18
|
# define FJ_UNLIKELY(x) (x)
|
|
18
19
|
# define FJ_PREFETCH(p) ((void)0)
|
|
20
|
+
# define FJ_ALWAYS_INLINE inline
|
|
19
21
|
#endif
|
|
20
22
|
|
|
21
23
|
/*
|
|
@@ -38,6 +40,7 @@ static ID fj_call_id; /* cached :call (invoking the on_warning handler) */
|
|
|
38
40
|
static VALUE fj_sym_empty_slot;
|
|
39
41
|
static VALUE fj_sym_empty_value;
|
|
40
42
|
static VALUE fj_sym_duplicate_key;
|
|
43
|
+
static VALUE fj_sym_number_overflow;
|
|
41
44
|
static ID fj_bigdecimal_id; /* cached BigDecimal() method id (set in Init) */
|
|
42
45
|
static ID fj_to_sym_id; /* cached :to_sym (symbolize_keys) */
|
|
43
46
|
static ID fj_key_p_id; /* cached :key? (non-default duplicate_key modes) */
|
|
@@ -48,8 +51,7 @@ static ID fj_name_id;
|
|
|
48
51
|
static VALUE fj_sym_encoding;
|
|
49
52
|
static VALUE fj_sym_symbolize_keys;
|
|
50
53
|
static VALUE fj_sym_first_wins;
|
|
51
|
-
static VALUE
|
|
52
|
-
static VALUE fj_sym_bigdecimal_load;
|
|
54
|
+
static VALUE fj_sym_decimal_precision;
|
|
53
55
|
static VALUE fj_sym_float;
|
|
54
56
|
static VALUE fj_sym_bigdecimal;
|
|
55
57
|
static VALUE fj_sym_on_warning;
|
|
@@ -70,8 +72,7 @@ typedef struct {
|
|
|
70
72
|
int depth;
|
|
71
73
|
int symbolize_keys;
|
|
72
74
|
int dup_first_wins;
|
|
73
|
-
int
|
|
74
|
-
int bigdecimal_load; /* 0 = float, 1 = auto, 2 = bigdecimal */
|
|
75
|
+
int decimal_precision; /* 0 = float, 1 = auto, 2 = bigdecimal */
|
|
75
76
|
fj_kc_slot *kcache; /* per-parse key cache (NULL when interning unavailable) */
|
|
76
77
|
VALUE on_warning; /* on_warning: callable invoked per non-fatal lenient fix, else Qnil */
|
|
77
78
|
} fj_state;
|
|
@@ -168,20 +169,39 @@ static long fj_mbws(const char *p, long n) {
|
|
|
168
169
|
return 0;
|
|
169
170
|
}
|
|
170
171
|
|
|
172
|
+
/* Skip a run of whitespace. This is hot on pretty-printed input, where most of
|
|
173
|
+
* the bytes are indentation. Indentation is homogeneous — all spaces OR all tabs,
|
|
174
|
+
* the two common styles — so a run of it is skipped 8 bytes at a time with a
|
|
175
|
+
* single 64-bit compare (the uniform-byte patterns read the same regardless of
|
|
176
|
+
* endianness). Everything else — newlines, CR, short/partial runs, and Unicode
|
|
177
|
+
* whitespace — falls to the tight byte loop, which also avoids the per-byte helper
|
|
178
|
+
* calls (fj_byte / fj_is_ws / fj_advance) the previous byte-at-a-time version paid.
|
|
179
|
+
* The set of bytes treated as whitespace is unchanged. */
|
|
171
180
|
static void fj_skip_pure_ws(fj_state *st) {
|
|
181
|
+
const char *p = st->buf + st->pos;
|
|
182
|
+
const char *end = st->buf + st->len;
|
|
172
183
|
for (;;) {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
} else if (b >= 0x80) {
|
|
178
|
-
long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
|
|
179
|
-
if (m == 0) break;
|
|
180
|
-
st->pos += m;
|
|
181
|
-
} else {
|
|
184
|
+
while (end - p >= 8) {
|
|
185
|
+
uint64_t w;
|
|
186
|
+
memcpy(&w, p, 8);
|
|
187
|
+
if (w == 0x2020202020202020ULL || w == 0x0909090909090909ULL) { p += 8; continue; }
|
|
182
188
|
break;
|
|
183
189
|
}
|
|
190
|
+
if (p >= end) break;
|
|
191
|
+
{
|
|
192
|
+
unsigned char b = (unsigned char)*p;
|
|
193
|
+
if (b == 0x20 || (b >= 0x09 && b <= 0x0D)) {
|
|
194
|
+
p++;
|
|
195
|
+
} else if (b >= 0x80) {
|
|
196
|
+
long m = fj_mbws(p, end - p);
|
|
197
|
+
if (m == 0) break;
|
|
198
|
+
p += m;
|
|
199
|
+
} else {
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
}
|
|
184
203
|
}
|
|
204
|
+
st->pos = p - st->buf;
|
|
185
205
|
}
|
|
186
206
|
|
|
187
207
|
/* A comment marker only starts a comment when preceded by whitespace or at the
|
|
@@ -228,9 +248,23 @@ static void fj_skip_ws_comments(fj_state *st) {
|
|
|
228
248
|
}
|
|
229
249
|
}
|
|
230
250
|
|
|
251
|
+
/* Hot-loop pre-check: can byte `b` open whitespace or a comment marker, i.e. is
 * a call to fj_skip_ws_comments worth making at all?
 *
 *   yes (1): ASCII whitespace (0x09..0x0D, 0x20), '#', '/', and any byte >= 0x80
 *            (a possible lead byte of multibyte whitespace);
 *   no  (0): everything else, including EOF (-1), which the caller's own -1
 *            checks deal with.
 *
 * On compact input the next byte is already a structural character or the start
 * of a value, so the answer is "no" and the caller elides the call. */
static inline int fj_needs_ws_skip(int b) {
    switch (b) {
    case 0x09: case 0x0A: case 0x0B: case 0x0C: case 0x0D: /* tab, LF, VT, FF, CR */
    case 0x20:                                             /* space */
    case '#':
    case '/':
        return 1;
    default:
        /* Negative values (EOF) and plain ASCII yield 0; high bytes yield 1. */
        return b >= 0x80;
    }
}
|
|
262
|
+
|
|
231
263
|
/* forward declarations (mutual recursion) */
|
|
232
264
|
static VALUE fj_parse_value(fj_state *st);
|
|
233
265
|
static VALUE fj_parse_member_value(fj_state *st);
|
|
266
|
+
static int fj_smart_quote_kind(fj_state *st);
|
|
267
|
+
static VALUE fj_parse_smart_string(fj_state *st, int kind);
|
|
234
268
|
|
|
235
269
|
static void fj_append_utf8(VALUE buf, unsigned long cp) {
|
|
236
270
|
char tmp[4];
|
|
@@ -471,7 +505,7 @@ static VALUE fj_to_bigdecimal_token(const char *p, long n) {
|
|
|
471
505
|
* (quoteless path) call these, so the Integer/Float a token produces is identical
|
|
472
506
|
* no matter which path scanned it. [p, n) is the raw token slice (with any sign),
|
|
473
507
|
* needed only by the bignum / strtod fallbacks. */
|
|
474
|
-
static VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, const char *p, long n) {
|
|
508
|
+
static FJ_ALWAYS_INLINE VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, const char *p, long n) {
|
|
475
509
|
if (!overflow && digits >= 1 && digits <= 18) {
|
|
476
510
|
int64_t v = (int64_t)m;
|
|
477
511
|
return LL2NUM(neg ? -v : v);
|
|
@@ -481,16 +515,96 @@ static VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, co
|
|
|
481
515
|
return rb_str_to_inum(fj_strip_underscores(p, n), 10, 0);
|
|
482
516
|
}
|
|
483
517
|
|
|
518
|
+
/* Convert a >17-digit / subnormal float token to a double. A double resolves ~17
|
|
519
|
+
* significant decimals; the digits past that affect only the final round-to-nearest-
|
|
520
|
+
* even, which a single sticky marker ("was any dropped digit nonzero?") captures. So
|
|
521
|
+
* we keep FJ_FLOAT_ODD_DIGITS significant digits and, if more nonzero digits follow,
|
|
522
|
+
* force the last kept digit odd (round-to-odd). strtod's round-to-nearest of that
|
|
523
|
+
* shorter mantissa then equals round-to-nearest of the full value — but strtod grinds
|
|
524
|
+
* far fewer digits. The kept count is well above 2x double's ~16 significant decimals,
|
|
525
|
+
* which is what round-to-odd needs to be exact (verified bit-for-bit against
|
|
526
|
+
* JSON.parse on the high-precision corpus). The token is rebuilt into a NUL-terminated
|
|
527
|
+
* "<digits>e<exp>" buffer (passing the raw input slice would make rb_cstr_to_dbl treat
|
|
528
|
+
* the trailing delimiter as garbage and re-run strtod a second time). */
|
|
529
|
+
#define FJ_FLOAT_ODD_DIGITS 40
|
|
530
|
+
static VALUE fj_float_strtod(const char *p, long n) {
|
|
531
|
+
char digits[FJ_FLOAT_ODD_DIGITS];
|
|
532
|
+
char out[FJ_FLOAT_ODD_DIGITS + 40];
|
|
533
|
+
long i = 0, ow = 0, kept = 0, point_pos = 0, lead_frac_zeros = 0;
|
|
534
|
+
int neg = 0, after_point = 0, seen_sig = 0, sticky = 0, esign = 0;
|
|
535
|
+
int64_t expl_exp = 0, x;
|
|
536
|
+
|
|
537
|
+
if (i < n && (p[i] == '+' || p[i] == '-')) { neg = (p[i] == '-'); i++; }
|
|
538
|
+
|
|
539
|
+
for (; i < n; i++) {
|
|
540
|
+
char c = p[i];
|
|
541
|
+
if (c == '_') continue;
|
|
542
|
+
if (c == '.') { after_point = 1; continue; }
|
|
543
|
+
if (c == 'e' || c == 'E') { i++; break; }
|
|
544
|
+
if (!seen_sig && c == '0') { if (after_point) lead_frac_zeros++; continue; }
|
|
545
|
+
seen_sig = 1;
|
|
546
|
+
if (!after_point) point_pos++;
|
|
547
|
+
if (kept < FJ_FLOAT_ODD_DIGITS) digits[kept++] = c;
|
|
548
|
+
else if (c != '0') sticky = 1;
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
if (i < n && (p[i] == '+' || p[i] == '-')) { esign = (p[i] == '-'); i++; }
|
|
552
|
+
for (; i < n; i++) {
|
|
553
|
+
char c = p[i];
|
|
554
|
+
if (c == '_') continue;
|
|
555
|
+
if (c < '0' || c > '9') break;
|
|
556
|
+
expl_exp = expl_exp * 10 + (c - '0');
|
|
557
|
+
}
|
|
558
|
+
if (esign) expl_exp = -expl_exp;
|
|
559
|
+
|
|
560
|
+
if (kept == 0) return rb_float_new(neg ? -0.0 : 0.0);
|
|
561
|
+
|
|
562
|
+
/* round-to-odd: a dropped nonzero tail forces the last kept digit odd. */
|
|
563
|
+
if (sticky && ((digits[kept - 1] - '0') % 2) == 0) digits[kept - 1]++;
|
|
564
|
+
|
|
565
|
+
x = expl_exp + point_pos - lead_frac_zeros - kept;
|
|
566
|
+
if (neg) out[ow++] = '-';
|
|
567
|
+
memcpy(out + ow, digits, (size_t)kept);
|
|
568
|
+
ow += kept;
|
|
569
|
+
/* Append "e<exp>" by hand. snprintf here showed up as BSD_vfprintf in profiling —
|
|
570
|
+
a full printf formatter per number is absurdly heavy for one integer. */
|
|
571
|
+
out[ow++] = 'e';
|
|
572
|
+
if (x < 0) { out[ow++] = '-'; x = -x; }
|
|
573
|
+
{
|
|
574
|
+
char ex[24];
|
|
575
|
+
int en = 0;
|
|
576
|
+
if (x == 0) ex[en++] = '0';
|
|
577
|
+
else while (x > 0) { ex[en++] = (char)('0' + (int)(x % 10)); x /= 10; }
|
|
578
|
+
while (en > 0) out[ow++] = ex[--en];
|
|
579
|
+
}
|
|
580
|
+
out[ow] = '\0';
|
|
581
|
+
return rb_float_new(rb_cstr_to_dbl(out, 0));
|
|
582
|
+
}
|
|
583
|
+
|
|
484
584
|
/* e10 is the final base-10 exponent (already adjusted by the fraction length). */
|
|
485
|
-
static VALUE fj_float_from_parts(uint64_t m10, int m10digits, int64_t e10, int neg, int overflow, const char *p, long n) {
|
|
486
|
-
|
|
487
|
-
|
|
585
|
+
static FJ_ALWAYS_INLINE VALUE fj_float_from_parts(fj_state *st, uint64_t m10, int m10digits, int64_t e10, int neg, int overflow, const char *p, long n) {
|
|
586
|
+
double d;
|
|
587
|
+
/* Fast path by mantissa width (our scanner accumulates m10 exactly up to 18
|
|
588
|
+
digits, flagging overflow beyond):
|
|
589
|
+
1..18 digits -> Eisel-Lemire, correctly-rounded for any exact uint64 mantissa
|
|
590
|
+
(Mushtak-Lemire). This pulls full-double-precision data (e.g.
|
|
591
|
+
citylots coordinates, 18 sig digits) off the slow strtod
|
|
592
|
+
fallback — the stdlib json gem still strtods it.
|
|
593
|
+
>18 digits / overflow / extreme exponent -> strtod (round-to-odd). */
|
|
594
|
+
if (!overflow && m10digits >= 1 && m10digits <= 18 && (long)m10digits + e10 >= -307) {
|
|
488
595
|
if (m10 == 0) return rb_float_new(neg ? -0.0 : 0.0);
|
|
489
|
-
|
|
596
|
+
d = fj_eisel_lemire_s2d(e10, m10, neg);
|
|
597
|
+
} else {
|
|
598
|
+
/* Fallback for >18 digits / extreme or subnormal exponents. */
|
|
599
|
+
d = RFLOAT_VALUE(fj_float_strtod(p, n));
|
|
490
600
|
}
|
|
491
|
-
/*
|
|
492
|
-
|
|
493
|
-
|
|
601
|
+
/* A finite literal whose magnitude exceeds Float range (e.g. 1e400) becomes
|
|
602
|
+
±Infinity — a silent data change. Report it via :number_overflow (the value is
|
|
603
|
+
still returned). The Infinity/NaN keywords take separate paths and never get here.
|
|
604
|
+
Gate isinf on a listening handler (matches the Ruby float_or_warn): no handler ->
|
|
605
|
+
no point detecting, and it keeps the test off the hot number path. */
|
|
606
|
+
if (st->on_warning != Qnil && isinf(d)) fj_warn(st, fj_sym_number_overflow, "number literal out of Float range — collapsed to Infinity");
|
|
607
|
+
return rb_float_new(d);
|
|
494
608
|
}
|
|
495
609
|
|
|
496
610
|
/* Scan an already-bounded quoteless token [p, p+n) exactly once: validate it as a
|
|
@@ -571,11 +685,11 @@ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) {
|
|
|
571
685
|
e10 -= frac;
|
|
572
686
|
/* :bigdecimal always; :auto only when significant digits > 16. m10digits is >=
|
|
573
687
|
* the significant-digit count, so m10digits <= 16 skips the fj_sig_digits scan. */
|
|
574
|
-
if (st->
|
|
575
|
-
(st->
|
|
688
|
+
if (st->decimal_precision == 2 ||
|
|
689
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(p, n) > 16)) {
|
|
576
690
|
*out = fj_to_bigdecimal_token(p, n);
|
|
577
691
|
} else {
|
|
578
|
-
*out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, p, n);
|
|
692
|
+
*out = fj_float_from_parts(st, m10, m10digits, e10, neg, overflow, p, n);
|
|
579
693
|
}
|
|
580
694
|
return 1;
|
|
581
695
|
}
|
|
@@ -596,7 +710,7 @@ static VALUE fj_parse_number(fj_state *st) {
|
|
|
596
710
|
long nlen;
|
|
597
711
|
int is_float = 0, neg = 0, overflow = 0;
|
|
598
712
|
uint64_t m10 = 0; /* mantissa: integer + fraction digits */
|
|
599
|
-
int m10digits = 0; /* mantissa digit chars (caps the
|
|
713
|
+
int m10digits = 0; /* mantissa digit chars (caps the Eisel-Lemire fast path at 18) */
|
|
600
714
|
int frac = 0; /* fraction digit chars: e10 -= frac */
|
|
601
715
|
int64_t e10 = 0;
|
|
602
716
|
|
|
@@ -683,11 +797,11 @@ static VALUE fj_parse_number(fj_state *st) {
|
|
|
683
797
|
* when significant digits > 16. Since m10digits >= significant digits, m10digits
|
|
684
798
|
* <= 16 guarantees not-BigDecimal and lets us skip the fj_sig_digits scan
|
|
685
799
|
* entirely (the common case — e.g. every coordinate in canada.json). */
|
|
686
|
-
if (st->
|
|
687
|
-
(st->
|
|
800
|
+
if (st->decimal_precision == 2 ||
|
|
801
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
|
|
688
802
|
return fj_to_bigdecimal_token(np, nlen);
|
|
689
803
|
}
|
|
690
|
-
return fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
|
|
804
|
+
return fj_float_from_parts(st, m10, m10digits, e10, neg, overflow, np, nlen);
|
|
691
805
|
}
|
|
692
806
|
|
|
693
807
|
static VALUE fj_parse_literal(fj_state *st, const char *word, VALUE value) {
|
|
@@ -740,6 +854,7 @@ static VALUE fj_parse_identifier_key(fj_state *st) {
|
|
|
740
854
|
|
|
741
855
|
static VALUE fj_parse_object_key(fj_state *st) {
|
|
742
856
|
int b = fj_byte(st);
|
|
857
|
+
int kind;
|
|
743
858
|
|
|
744
859
|
/* Quoted key. The common case has no escapes: intern straight from the buffer
|
|
745
860
|
* with no throwaway allocation. An escaped key (rare) falls through to the
|
|
@@ -760,6 +875,12 @@ static VALUE fj_parse_object_key(fj_state *st) {
|
|
|
760
875
|
return fj_parse_string(st, b);
|
|
761
876
|
}
|
|
762
877
|
|
|
878
|
+
/* A key may open with a smart/curly quote too (a word-processor paste curls the
|
|
879
|
+
* keys, not just the values) — route to the same reader the value path uses.
|
|
880
|
+
* Mirrors the Ruby fallback's parse_object_key; Hash#[]= dedups the key on store. */
|
|
881
|
+
kind = fj_smart_quote_kind(st);
|
|
882
|
+
if (kind) return fj_parse_smart_string(st, kind);
|
|
883
|
+
|
|
763
884
|
if (fj_is_key_start(b)) return fj_parse_identifier_key(st);
|
|
764
885
|
|
|
765
886
|
fj_error(st, "expected a key");
|
|
@@ -851,7 +972,8 @@ static VALUE fj_classify_quoteless(fj_state *st, const char *p0, long n0) {
|
|
|
851
972
|
* before the whitespace check. */
|
|
852
973
|
enum { FJ_QL_ORD = 0, FJ_QL_TERM, FJ_QL_WS, FJ_QL_CMT };
|
|
853
974
|
static const unsigned char fj_ql_class[256] = {
|
|
854
|
-
[','] = FJ_QL_TERM, ['
|
|
975
|
+
[','] = FJ_QL_TERM, ['{'] = FJ_QL_TERM, ['}'] = FJ_QL_TERM,
|
|
976
|
+
['['] = FJ_QL_TERM, [']'] = FJ_QL_TERM,
|
|
855
977
|
[0x0A] = FJ_QL_TERM, [0x0D] = FJ_QL_TERM,
|
|
856
978
|
[0x09] = FJ_QL_WS, [0x0B] = FJ_QL_WS, [0x0C] = FJ_QL_WS, [' '] = FJ_QL_WS,
|
|
857
979
|
['#'] = FJ_QL_CMT, ['/'] = FJ_QL_CMT,
|
|
@@ -1078,7 +1200,8 @@ static int fj_try_member_number(fj_state *st, VALUE *out) {
|
|
|
1078
1200
|
/* Commit only if the number abuts a value terminator; otherwise (whitespace,
|
|
1079
1201
|
* letters, a second '.', "0x…", …) leave it to the quoteless scanner. */
|
|
1080
1202
|
t = (unsigned char)*p;
|
|
1081
|
-
if (!(t == ',' || t == '
|
|
1203
|
+
if (!(t == ',' || t == '{' || t == '}' || t == '[' || t == ']' ||
|
|
1204
|
+
t == 0x0A || t == 0x0D || p == buf + st->len)) {
|
|
1082
1205
|
return 0;
|
|
1083
1206
|
}
|
|
1084
1207
|
|
|
@@ -1089,11 +1212,11 @@ static int fj_try_member_number(fj_state *st, VALUE *out) {
|
|
|
1089
1212
|
return 1;
|
|
1090
1213
|
}
|
|
1091
1214
|
e10 -= frac;
|
|
1092
|
-
if (st->
|
|
1093
|
-
(st->
|
|
1215
|
+
if (st->decimal_precision == 2 ||
|
|
1216
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
|
|
1094
1217
|
*out = fj_to_bigdecimal_token(np, nlen);
|
|
1095
1218
|
} else {
|
|
1096
|
-
*out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
|
|
1219
|
+
*out = fj_float_from_parts(st, m10, m10digits, e10, neg, overflow, np, nlen);
|
|
1097
1220
|
}
|
|
1098
1221
|
return 1;
|
|
1099
1222
|
}
|
|
@@ -1164,19 +1287,9 @@ static void fj_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) {
|
|
|
1164
1287
|
void rb_hash_bulk_insert(long, const VALUE *, VALUE);
|
|
1165
1288
|
#endif
|
|
1166
1289
|
|
|
1167
|
-
/* Hash entry count as a C long. RHASH_SIZE is not part of the public C API on
|
|
1168
|
-
* older Ruby (< ~2.7), but rb_hash_size (Hash#size's implementation) is available
|
|
1169
|
-
* everywhere. Only used on the rare :raise duplicate-key path, so the boxing cost
|
|
1170
|
-
* is irrelevant — and it keeps the extension buildable down to Ruby 2.5. */
|
|
1171
|
-
static inline long fj_hash_len(VALUE hash) {
|
|
1172
|
-
return NUM2LONG(rb_hash_size(hash));
|
|
1173
|
-
}
|
|
1174
|
-
|
|
1175
1290
|
/* Build a Hash from `count` interleaved key,value slots. Fast path (String keys,
|
|
1176
|
-
* default :last_wins
|
|
1177
|
-
*
|
|
1178
|
-
* actually happened. symbolize_keys / :first_wins use a per-member loop into the
|
|
1179
|
-
* same pre-sized hash. */
|
|
1291
|
+
* default :last_wins): pre-size + bulk insert. symbolize_keys / :first_wins use a
|
|
1292
|
+
* per-member loop into the same pre-sized hash. */
|
|
1180
1293
|
static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
|
|
1181
1294
|
long entries = count / 2, i;
|
|
1182
1295
|
VALUE hash = rb_hash_new_capa(entries);
|
|
@@ -1185,22 +1298,13 @@ static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
|
|
|
1185
1298
|
* the per-member loop below to report each dropped duplicate key. */
|
|
1186
1299
|
if (!st->symbolize_keys && !st->dup_first_wins && st->on_warning == Qnil) {
|
|
1187
1300
|
rb_hash_bulk_insert(count, pairs, hash);
|
|
1188
|
-
if (st->dup_raise && fj_hash_len(hash) < entries) {
|
|
1189
|
-
VALUE seen = rb_hash_new_capa(entries);
|
|
1190
|
-
for (i = 0; i + 1 < count; i += 2) {
|
|
1191
|
-
long before = fj_hash_len(seen);
|
|
1192
|
-
rb_hash_aset(seen, pairs[i], Qtrue);
|
|
1193
|
-
if (fj_hash_len(seen) == before) fj_error(st, "duplicate key");
|
|
1194
|
-
}
|
|
1195
|
-
}
|
|
1196
1301
|
return hash;
|
|
1197
1302
|
}
|
|
1198
1303
|
|
|
1199
1304
|
for (i = 0; i + 1 < count; i += 2) {
|
|
1200
1305
|
VALUE k = st->symbolize_keys ? rb_funcall(pairs[i], fj_to_sym_id, 0) : pairs[i];
|
|
1201
|
-
if (st->dup_first_wins || st->
|
|
1306
|
+
if (st->dup_first_wins || st->on_warning != Qnil) {
|
|
1202
1307
|
if (RTEST(rb_funcall(hash, fj_key_p_id, 1, k))) {
|
|
1203
|
-
if (st->dup_raise) fj_error(st, "duplicate key");
|
|
1204
1308
|
fj_warn(st, fj_sym_duplicate_key, "duplicate key");
|
|
1205
1309
|
if (st->dup_first_wins) continue;
|
|
1206
1310
|
}
|
|
@@ -1274,11 +1378,14 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1274
1378
|
int is_obj;
|
|
1275
1379
|
|
|
1276
1380
|
if (ps->fhead == 0) { /* top level: parse exactly one value */
|
|
1277
|
-
fj_skip_ws_comments(st);
|
|
1278
1381
|
b = fj_byte(st);
|
|
1382
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1279
1383
|
if (b == '{') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 1); vss = 0; continue; }
|
|
1280
1384
|
if (b == '[') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 0); vss = 0; continue; }
|
|
1281
1385
|
if (b == -1) fj_error(st, "unexpected end of input");
|
|
1386
|
+
/* Top-level scalar: must be a recognized JSON value (number / literal / quoted
|
|
1387
|
+
* string). A bare word raises — no top-level quoteless strings (B-broad). The
|
|
1388
|
+
* scalar-vs-separator boundary is enforced in fj_parse_c. */
|
|
1282
1389
|
result = fj_parse_value(st);
|
|
1283
1390
|
break;
|
|
1284
1391
|
}
|
|
@@ -1288,8 +1395,8 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1288
1395
|
|
|
1289
1396
|
if (is_obj) {
|
|
1290
1397
|
VALUE key;
|
|
1291
|
-
fj_skip_ws_comments(st);
|
|
1292
1398
|
b = fj_byte(st);
|
|
1399
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1293
1400
|
if (b == ',') { /* collapsing separator: skip empty member */
|
|
1294
1401
|
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
|
|
1295
1402
|
vss = 0;
|
|
@@ -1316,11 +1423,12 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1316
1423
|
}
|
|
1317
1424
|
if (b == ']') fj_error(st, "unexpected ']' — expected a key or '}'");
|
|
1318
1425
|
key = fj_parse_object_key(st);
|
|
1319
|
-
|
|
1320
|
-
if (
|
|
1426
|
+
b = fj_byte(st);
|
|
1427
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1428
|
+
if (b != ':') fj_error(st, "expected ':' after object key");
|
|
1321
1429
|
fj_advance(st, 1);
|
|
1322
|
-
fj_skip_ws_comments(st);
|
|
1323
1430
|
b = fj_byte(st);
|
|
1431
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1324
1432
|
if (b == '{' || b == '[') {
|
|
1325
1433
|
fj_vpush(ps, key);
|
|
1326
1434
|
fj_advance(st, 1);
|
|
@@ -1340,8 +1448,8 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1340
1448
|
fj_vpush(ps, fj_parse_member_value(st));
|
|
1341
1449
|
vss = 1;
|
|
1342
1450
|
} else { /* array */
|
|
1343
|
-
fj_skip_ws_comments(st);
|
|
1344
1451
|
b = fj_byte(st);
|
|
1452
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1345
1453
|
if (b == ',') { /* collapsing separator: skip empty slot */
|
|
1346
1454
|
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
|
|
1347
1455
|
vss = 0;
|
|
@@ -1367,6 +1475,15 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1367
1475
|
vss = 0;
|
|
1368
1476
|
continue;
|
|
1369
1477
|
}
|
|
1478
|
+
/* Strict hot path: inline the two commonest element types — a number and a
|
|
1479
|
+
plain double-quoted string — so they skip fj_parse_member_value's byte
|
|
1480
|
+
re-read + switch. Everything else (quoteless, single/triple-quote,
|
|
1481
|
+
smart-quote, literals) falls through to the full dispatch below. */
|
|
1482
|
+
if (b == '-' || b == '+' || b == '.' || (b >= '0' && b <= '9')) {
|
|
1483
|
+
VALUE num;
|
|
1484
|
+
if (fj_try_member_number(st, &num)) { fj_vpush(ps, num); vss = 1; continue; }
|
|
1485
|
+
}
|
|
1486
|
+
if (b == '"') { fj_vpush(ps, fj_parse_string(st, '"')); vss = 1; continue; }
|
|
1370
1487
|
fj_vpush(ps, fj_parse_member_value(st));
|
|
1371
1488
|
vss = 1;
|
|
1372
1489
|
}
|
|
@@ -1391,9 +1508,46 @@ static int fj_implicit_root_ahead(fj_state *st) {
|
|
|
1391
1508
|
return result;
|
|
1392
1509
|
}
|
|
1393
1510
|
|
|
1511
|
+
/* Between top-level documents, whitespace, comments, AND commas all separate
|
|
1512
|
+
* (commas collapse like the in-container lenient-comma rule). A space alone never
|
|
1513
|
+
* separates — that is handled inside the document by the quoteless run. Mirrors
|
|
1514
|
+
* the Ruby Parser#skip_document_separators. */
|
|
1515
|
+
static void fj_skip_document_separators(fj_state *st) {
|
|
1516
|
+
for (;;) {
|
|
1517
|
+
fj_skip_ws_comments(st);
|
|
1518
|
+
if (fj_byte(st) != ',') break;
|
|
1519
|
+
fj_advance(st, 1);
|
|
1520
|
+
}
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
/* Nonzero when `b` is ASCII horizontal whitespace: space, tab, VT (0x0B) or FF (0x0C).
 * Line breaks (LF, CR) and EOF (-1) are not horizontal whitespace. */
static int fj_is_hws(int b) {
    switch (b) {
    case ' ':
    case '\t':
    case 0x0B:
    case 0x0C:
        return 1;
    default:
        return 0;
    }
}
|
|
1524
|
+
|
|
1525
|
+
/* After a top-level value: a self-delimiting value (object / array / string) may be
|
|
1526
|
+
* followed by anything, but a bare scalar (number / keyword) must be followed by a
|
|
1527
|
+
* real separator — a newline, ',', a comment, or EOF. A space is NOT a separator, so
|
|
1528
|
+
* `1 2 3` and `42 "x" true` raise. Mirrors the Ruby Parser#enforce_scalar_boundary. */
|
|
1529
|
+
static void fj_enforce_scalar_boundary(fj_state *st, VALUE value) {
|
|
1530
|
+
int b, nx;
|
|
1531
|
+
if (RB_TYPE_P(value, T_STRING) || RB_TYPE_P(value, T_HASH) || RB_TYPE_P(value, T_ARRAY)) return;
|
|
1532
|
+
for (;;) {
|
|
1533
|
+
b = fj_byte(st);
|
|
1534
|
+
if (b != -1 && fj_is_hws(b)) { fj_advance(st, 1); continue; }
|
|
1535
|
+
if (b != -1 && b >= 0x80) {
|
|
1536
|
+
long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
|
|
1537
|
+
if (m > 0) { st->pos += m; continue; } /* multibyte horizontal whitespace (NBSP, …) */
|
|
1538
|
+
}
|
|
1539
|
+
break;
|
|
1540
|
+
}
|
|
1541
|
+
b = fj_byte(st);
|
|
1542
|
+
if (b == -1 || b == 0x0A || b == 0x0D || b == ',') return;
|
|
1543
|
+
if (b == '#') return;
|
|
1544
|
+
if (b == '/') { nx = fj_byte_at(st, 1); if (nx == '/' || nx == '*') return; }
|
|
1545
|
+
fj_error(st, "a top-level number or keyword must be followed by a newline, ',', or end of input");
|
|
1546
|
+
}
|
|
1547
|
+
|
|
1394
1548
|
static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
|
|
1395
1549
|
fj_state st;
|
|
1396
|
-
VALUE
|
|
1550
|
+
VALUE enc_opt, dk;
|
|
1397
1551
|
|
|
1398
1552
|
Check_Type(input, T_STRING);
|
|
1399
1553
|
|
|
@@ -1423,13 +1577,12 @@ static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
|
|
|
1423
1577
|
st.symbolize_keys = RTEST(rb_hash_aref(opts, fj_sym_symbolize_keys));
|
|
1424
1578
|
dk = rb_hash_aref(opts, fj_sym_duplicate_key);
|
|
1425
1579
|
st.dup_first_wins = (dk == fj_sym_first_wins);
|
|
1426
|
-
st.dup_raise = (dk == fj_sym_raise);
|
|
1427
1580
|
|
|
1428
1581
|
{
|
|
1429
|
-
VALUE bd = rb_hash_aref(opts,
|
|
1430
|
-
if (bd == fj_sym_float) st.
|
|
1431
|
-
else if (bd == fj_sym_bigdecimal) st.
|
|
1432
|
-
else st.
|
|
1582
|
+
VALUE bd = rb_hash_aref(opts, fj_sym_decimal_precision);
|
|
1583
|
+
if (bd == fj_sym_float) st.decimal_precision = 0;
|
|
1584
|
+
else if (bd == fj_sym_bigdecimal) st.decimal_precision = 2;
|
|
1585
|
+
else st.decimal_precision = 1; /* :auto (default), including nil */
|
|
1433
1586
|
}
|
|
1434
1587
|
|
|
1435
1588
|
st.on_warning = rb_hash_aref(opts, fj_sym_on_warning); /* Qnil when absent */
|
|
@@ -1439,36 +1592,37 @@ static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
|
|
|
1439
1592
|
st.pos = 3;
|
|
1440
1593
|
}
|
|
1441
1594
|
|
|
1442
|
-
/* With a block: yield each top-level
|
|
1443
|
-
* concatenated). Same loop as the Ruby each_value path
|
|
1595
|
+
/* With a block: yield each top-level document until EOF and return the document
|
|
1596
|
+
* count (NDJSON / JSONL / concatenated). Same loop as the Ruby each_value path. */
|
|
1444
1597
|
if (rb_block_given_p()) {
|
|
1598
|
+
long count = 0;
|
|
1445
1599
|
for (;;) {
|
|
1446
|
-
|
|
1600
|
+
VALUE v;
|
|
1601
|
+
fj_skip_document_separators(&st);
|
|
1447
1602
|
if (fj_eof(&st)) break;
|
|
1448
|
-
|
|
1603
|
+
v = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1604
|
+
fj_enforce_scalar_boundary(&st, v);
|
|
1605
|
+
rb_yield(v);
|
|
1606
|
+
count++;
|
|
1449
1607
|
}
|
|
1450
|
-
return
|
|
1608
|
+
return LONG2NUM(count);
|
|
1451
1609
|
}
|
|
1452
1610
|
|
|
1453
|
-
/* No block:
|
|
1454
|
-
*
|
|
1455
|
-
*
|
|
1456
|
-
*
|
|
1457
|
-
*
|
|
1458
|
-
* whitespace / newline / concatenation do), so a bracketless comma list still
|
|
1459
|
-
* raises in fj_parse_iter — the unsupported implicit-root array. */
|
|
1460
|
-
fj_skip_ws_comments(&st);
|
|
1461
|
-
if (fj_eof(&st)) return Qnil;
|
|
1462
|
-
value = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1463
|
-
fj_skip_ws_comments(&st);
|
|
1464
|
-
if (fj_eof(&st)) return value;
|
|
1611
|
+
/* No block: always return an Array of every top-level document (0 -> [], 1 ->
|
|
1612
|
+
* [doc], 2+ -> [d1, d2, …]) — the always-array contract. Documents are separated by
|
|
1613
|
+
* newline / comma / concatenation (self-delimiting values); a space alone never
|
|
1614
|
+
* separates, and a bare scalar must be followed by a real separator, so `1 2 3`
|
|
1615
|
+
* raises while `1\n2\n3` and `1, 2, 3` are three documents. */
|
|
1465
1616
|
{
|
|
1466
1617
|
VALUE arr = rb_ary_new();
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1618
|
+
for (;;) {
|
|
1619
|
+
VALUE v;
|
|
1620
|
+
fj_skip_document_separators(&st);
|
|
1621
|
+
if (fj_eof(&st)) break;
|
|
1622
|
+
v = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1623
|
+
fj_enforce_scalar_boundary(&st, v);
|
|
1624
|
+
rb_ary_push(arr, v);
|
|
1625
|
+
}
|
|
1472
1626
|
return arr;
|
|
1473
1627
|
}
|
|
1474
1628
|
}
|
|
@@ -1490,11 +1644,11 @@ void Init_smarter_json(void) {
|
|
|
1490
1644
|
fj_sym_empty_slot = ID2SYM(rb_intern("empty_slot"));
|
|
1491
1645
|
fj_sym_empty_value = ID2SYM(rb_intern("empty_value"));
|
|
1492
1646
|
fj_sym_duplicate_key = ID2SYM(rb_intern("duplicate_key"));
|
|
1647
|
+
fj_sym_number_overflow = ID2SYM(rb_intern("number_overflow"));
|
|
1493
1648
|
fj_sym_encoding = ID2SYM(rb_intern("encoding"));
|
|
1494
1649
|
fj_sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
|
|
1495
1650
|
fj_sym_first_wins = ID2SYM(rb_intern("first_wins"));
|
|
1496
|
-
|
|
1497
|
-
fj_sym_bigdecimal_load = ID2SYM(rb_intern("bigdecimal_load"));
|
|
1651
|
+
fj_sym_decimal_precision = ID2SYM(rb_intern("decimal_precision"));
|
|
1498
1652
|
fj_sym_float = ID2SYM(rb_intern("float"));
|
|
1499
1653
|
fj_sym_bigdecimal = ID2SYM(rb_intern("bigdecimal"));
|
|
1500
1654
|
fj_sym_on_warning = ID2SYM(rb_intern("on_warning"));
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2021 The fast_float authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any
|
|
6
|
+
person obtaining a copy of this software and associated
|
|
7
|
+
documentation files (the "Software"), to deal in the
|
|
8
|
+
Software without restriction, including without
|
|
9
|
+
limitation the rights to use, copy, modify, merge,
|
|
10
|
+
publish, distribute, sublicense, and/or sell copies of
|
|
11
|
+
the Software, and to permit persons to whom the Software
|
|
12
|
+
is furnished to do so, subject to the following
|
|
13
|
+
conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice
|
|
16
|
+
shall be included in all copies or substantial portions
|
|
17
|
+
of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
20
|
+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
|
21
|
+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
22
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
23
|
+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
24
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
25
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
|
26
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
27
|
+
DEALINGS IN THE SOFTWARE.
|