smarter_json 0.8.0 → 0.9.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +83 -47
- data/README.md +225 -46
- data/docs/_introduction.md +6 -12
- data/docs/basic_read_api.md +59 -16
- data/docs/basic_write_api.md +2 -2
- data/docs/examples.md +58 -24
- data/docs/options.md +14 -14
- data/ext/smarter_json/smarter_json.c +261 -97
- data/ext/smarter_json/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_json/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_json/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_json/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_json/backports.rb +28 -0
- data/lib/smarter_json/options.rb +52 -0
- data/lib/smarter_json/parser.rb +722 -198
- data/lib/smarter_json/version.rb +1 -1
- data/lib/smarter_json.rb +3 -1
- metadata +9 -5
- data/ext/smarter_json/vendor/ryu.h +0 -819
- data/ext/smarter_json/vendor/ryu.md +0 -22
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
#ifdef __ARM_NEON
|
|
5
5
|
#include <arm_neon.h>
|
|
6
6
|
#endif
|
|
7
|
-
#include "vendor/
|
|
7
|
+
#include "vendor/eisel_lemire.h" /* Eisel-Lemire decimal->double, correctly rounded (fast_float) */
|
|
8
8
|
|
|
9
9
|
/* Branch hints / prefetch on the hot scan loops. No-ops on compilers without the
|
|
10
10
|
* builtins (the code is correct either way; these only steer code layout). */
|
|
@@ -12,10 +12,12 @@
|
|
|
12
12
|
# define FJ_LIKELY(x) __builtin_expect(!!(x), 1)
|
|
13
13
|
# define FJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
|
|
14
14
|
# define FJ_PREFETCH(p) __builtin_prefetch(p)
|
|
15
|
+
# define FJ_ALWAYS_INLINE inline __attribute__((always_inline))
|
|
15
16
|
#else
|
|
16
17
|
# define FJ_LIKELY(x) (x)
|
|
17
18
|
# define FJ_UNLIKELY(x) (x)
|
|
18
19
|
# define FJ_PREFETCH(p) ((void)0)
|
|
20
|
+
# define FJ_ALWAYS_INLINE inline
|
|
19
21
|
#endif
|
|
20
22
|
|
|
21
23
|
/*
|
|
@@ -41,6 +43,17 @@ static VALUE fj_sym_duplicate_key;
|
|
|
41
43
|
static ID fj_bigdecimal_id; /* cached BigDecimal() method id (set in Init) */
|
|
42
44
|
static ID fj_to_sym_id; /* cached :to_sym (symbolize_keys) */
|
|
43
45
|
static ID fj_key_p_id; /* cached :key? (non-default duplicate_key modes) */
|
|
46
|
+
static ID fj_force_encoding_id;
|
|
47
|
+
static ID fj_valid_encoding_p_id;
|
|
48
|
+
static ID fj_encoding_id;
|
|
49
|
+
static ID fj_name_id;
|
|
50
|
+
static VALUE fj_sym_encoding;
|
|
51
|
+
static VALUE fj_sym_symbolize_keys;
|
|
52
|
+
static VALUE fj_sym_first_wins;
|
|
53
|
+
static VALUE fj_sym_decimal_precision;
|
|
54
|
+
static VALUE fj_sym_float;
|
|
55
|
+
static VALUE fj_sym_bigdecimal;
|
|
56
|
+
static VALUE fj_sym_on_warning;
|
|
44
57
|
|
|
45
58
|
/* Per-parse direct-mapped key cache: key bytes -> the interned (frozen,
|
|
46
59
|
* globally-rooted) String, so repeated keys skip the global fstring lookup.
|
|
@@ -58,8 +71,7 @@ typedef struct {
|
|
|
58
71
|
int depth;
|
|
59
72
|
int symbolize_keys;
|
|
60
73
|
int dup_first_wins;
|
|
61
|
-
int
|
|
62
|
-
int bigdecimal_load; /* 0 = float, 1 = auto, 2 = bigdecimal */
|
|
74
|
+
int decimal_precision; /* 0 = float, 1 = auto, 2 = bigdecimal */
|
|
63
75
|
fj_kc_slot *kcache; /* per-parse key cache (NULL when interning unavailable) */
|
|
64
76
|
VALUE on_warning; /* on_warning: callable invoked per non-fatal lenient fix, else Qnil */
|
|
65
77
|
} fj_state;
|
|
@@ -156,20 +168,39 @@ static long fj_mbws(const char *p, long n) {
|
|
|
156
168
|
return 0;
|
|
157
169
|
}
|
|
158
170
|
|
|
171
|
+
/* Skip a run of whitespace. This is hot on pretty-printed input, where most of
|
|
172
|
+
* the bytes are indentation. Indentation is homogeneous — all spaces OR all tabs,
|
|
173
|
+
* the two common styles — so a run of it is skipped 8 bytes at a time with a
|
|
174
|
+
* single 64-bit compare (the uniform-byte patterns read the same regardless of
|
|
175
|
+
* endianness). Everything else — newlines, CR, short/partial runs, and Unicode
|
|
176
|
+
* whitespace — falls to the tight byte loop, which also avoids the per-byte helper
|
|
177
|
+
* calls (fj_byte / fj_is_ws / fj_advance) the previous byte-at-a-time version paid.
|
|
178
|
+
* The set of bytes treated as whitespace is unchanged. */
|
|
159
179
|
static void fj_skip_pure_ws(fj_state *st) {
|
|
180
|
+
const char *p = st->buf + st->pos;
|
|
181
|
+
const char *end = st->buf + st->len;
|
|
160
182
|
for (;;) {
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
} else if (b >= 0x80) {
|
|
166
|
-
long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
|
|
167
|
-
if (m == 0) break;
|
|
168
|
-
st->pos += m;
|
|
169
|
-
} else {
|
|
183
|
+
while (end - p >= 8) {
|
|
184
|
+
uint64_t w;
|
|
185
|
+
memcpy(&w, p, 8);
|
|
186
|
+
if (w == 0x2020202020202020ULL || w == 0x0909090909090909ULL) { p += 8; continue; }
|
|
170
187
|
break;
|
|
171
188
|
}
|
|
189
|
+
if (p >= end) break;
|
|
190
|
+
{
|
|
191
|
+
unsigned char b = (unsigned char)*p;
|
|
192
|
+
if (b == 0x20 || (b >= 0x09 && b <= 0x0D)) {
|
|
193
|
+
p++;
|
|
194
|
+
} else if (b >= 0x80) {
|
|
195
|
+
long m = fj_mbws(p, end - p);
|
|
196
|
+
if (m == 0) break;
|
|
197
|
+
p += m;
|
|
198
|
+
} else {
|
|
199
|
+
break;
|
|
200
|
+
}
|
|
201
|
+
}
|
|
172
202
|
}
|
|
203
|
+
st->pos = p - st->buf;
|
|
173
204
|
}
|
|
174
205
|
|
|
175
206
|
/* A comment marker only starts a comment when preceded by whitespace or at the
|
|
@@ -216,6 +247,18 @@ static void fj_skip_ws_comments(fj_state *st) {
|
|
|
216
247
|
}
|
|
217
248
|
}
|
|
218
249
|
|
|
250
|
+
/* Cheap guard for the hot loop: could the current byte begin whitespace or a
|
|
251
|
+
* comment marker, so the (otherwise no-op) fj_skip_ws_comments call is actually
|
|
252
|
+
* needed? Compact data — the next byte is already a structural char or a value
|
|
253
|
+
* start — answers no, and we elide both the call and its memcpy/lookahead. ASCII
|
|
254
|
+
* whitespace, '#', '/', and possible multibyte-ws lead bytes (>=0x80) answer yes;
|
|
255
|
+
* EOF (-1) answers no (the caller's existing -1 checks handle it). Behaviour is
|
|
256
|
+
* identical to always calling fj_skip_ws_comments — this only skips a known no-op. */
|
|
257
|
+
static inline int fj_needs_ws_skip(int b) {
|
|
258
|
+
if (b < 0) return 0;
|
|
259
|
+
return b == 0x20 || (b >= 0x09 && b <= 0x0D) || b == '#' || b == '/' || b >= 0x80;
|
|
260
|
+
}
|
|
261
|
+
|
|
219
262
|
/* forward declarations (mutual recursion) */
|
|
220
263
|
static VALUE fj_parse_value(fj_state *st);
|
|
221
264
|
static VALUE fj_parse_member_value(fj_state *st);
|
|
@@ -373,11 +416,17 @@ static void fj_consume_keyword(fj_state *st, const char *word) {
|
|
|
373
416
|
fj_advance(st, n);
|
|
374
417
|
}
|
|
375
418
|
|
|
376
|
-
/* Copy a byte range into a fresh String, dropping underscores.
|
|
419
|
+
/* Copy a byte range into a fresh String, dropping underscores. Copies whole
|
|
420
|
+
* underscore-free runs in bulk, rather than one byte at a time. */
|
|
377
421
|
static VALUE fj_strip_underscores(const char *p, long n) {
|
|
378
422
|
VALUE s = rb_str_buf_new(n);
|
|
379
|
-
long i;
|
|
380
|
-
|
|
423
|
+
long i = 0;
|
|
424
|
+
while (i < n) {
|
|
425
|
+
long start = i;
|
|
426
|
+
while (i < n && p[i] != '_') i++;
|
|
427
|
+
if (i > start) rb_str_buf_cat(s, p + start, i - start);
|
|
428
|
+
if (i < n) i++; /* skip '_' */
|
|
429
|
+
}
|
|
381
430
|
return s;
|
|
382
431
|
}
|
|
383
432
|
|
|
@@ -453,7 +502,7 @@ static VALUE fj_to_bigdecimal_token(const char *p, long n) {
|
|
|
453
502
|
* (quoteless path) call these, so the Integer/Float a token produces is identical
|
|
454
503
|
* no matter which path scanned it. [p, n) is the raw token slice (with any sign),
|
|
455
504
|
* needed only by the bignum / strtod fallbacks. */
|
|
456
|
-
static VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, const char *p, long n) {
|
|
505
|
+
static FJ_ALWAYS_INLINE VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, const char *p, long n) {
|
|
457
506
|
if (!overflow && digits >= 1 && digits <= 18) {
|
|
458
507
|
int64_t v = (int64_t)m;
|
|
459
508
|
return LL2NUM(neg ? -v : v);
|
|
@@ -463,16 +512,87 @@ static VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, co
|
|
|
463
512
|
return rb_str_to_inum(fj_strip_underscores(p, n), 10, 0);
|
|
464
513
|
}
|
|
465
514
|
|
|
515
|
+
/* Convert a >17-digit / subnormal float token to a double. A double resolves ~17
|
|
516
|
+
* significant decimals; the digits past that affect only the final round-to-nearest-
|
|
517
|
+
* even, which a single sticky marker ("was any dropped digit nonzero?") captures. So
|
|
518
|
+
* we keep FJ_FLOAT_ODD_DIGITS significant digits and, if more nonzero digits follow,
|
|
519
|
+
* force the last kept digit odd (round-to-odd). strtod's round-to-nearest of that
|
|
520
|
+
* shorter mantissa then equals round-to-nearest of the full value — but strtod grinds
|
|
521
|
+
* far fewer digits. The kept count is well above 2x double's ~16 significant decimals,
|
|
522
|
+
* which is what round-to-odd needs to be exact (verified bit-for-bit against
|
|
523
|
+
* JSON.parse on the high-precision corpus). The token is rebuilt into a NUL-terminated
|
|
524
|
+
* "<digits>e<exp>" buffer (passing the raw input slice would make rb_cstr_to_dbl treat
|
|
525
|
+
* the trailing delimiter as garbage and re-run strtod a second time). */
|
|
526
|
+
#define FJ_FLOAT_ODD_DIGITS 40
|
|
527
|
+
static VALUE fj_float_strtod(const char *p, long n) {
|
|
528
|
+
char digits[FJ_FLOAT_ODD_DIGITS];
|
|
529
|
+
char out[FJ_FLOAT_ODD_DIGITS + 40];
|
|
530
|
+
long i = 0, ow = 0, kept = 0, point_pos = 0, lead_frac_zeros = 0;
|
|
531
|
+
int neg = 0, after_point = 0, seen_sig = 0, sticky = 0, esign = 0;
|
|
532
|
+
int64_t expl_exp = 0, x;
|
|
533
|
+
|
|
534
|
+
if (i < n && (p[i] == '+' || p[i] == '-')) { neg = (p[i] == '-'); i++; }
|
|
535
|
+
|
|
536
|
+
for (; i < n; i++) {
|
|
537
|
+
char c = p[i];
|
|
538
|
+
if (c == '_') continue;
|
|
539
|
+
if (c == '.') { after_point = 1; continue; }
|
|
540
|
+
if (c == 'e' || c == 'E') { i++; break; }
|
|
541
|
+
if (!seen_sig && c == '0') { if (after_point) lead_frac_zeros++; continue; }
|
|
542
|
+
seen_sig = 1;
|
|
543
|
+
if (!after_point) point_pos++;
|
|
544
|
+
if (kept < FJ_FLOAT_ODD_DIGITS) digits[kept++] = c;
|
|
545
|
+
else if (c != '0') sticky = 1;
|
|
546
|
+
}
|
|
547
|
+
|
|
548
|
+
if (i < n && (p[i] == '+' || p[i] == '-')) { esign = (p[i] == '-'); i++; }
|
|
549
|
+
for (; i < n; i++) {
|
|
550
|
+
char c = p[i];
|
|
551
|
+
if (c == '_') continue;
|
|
552
|
+
if (c < '0' || c > '9') break;
|
|
553
|
+
expl_exp = expl_exp * 10 + (c - '0');
|
|
554
|
+
}
|
|
555
|
+
if (esign) expl_exp = -expl_exp;
|
|
556
|
+
|
|
557
|
+
if (kept == 0) return rb_float_new(neg ? -0.0 : 0.0);
|
|
558
|
+
|
|
559
|
+
/* round-to-odd: a dropped nonzero tail forces the last kept digit odd. */
|
|
560
|
+
if (sticky && ((digits[kept - 1] - '0') % 2) == 0) digits[kept - 1]++;
|
|
561
|
+
|
|
562
|
+
x = expl_exp + point_pos - lead_frac_zeros - kept;
|
|
563
|
+
if (neg) out[ow++] = '-';
|
|
564
|
+
memcpy(out + ow, digits, (size_t)kept);
|
|
565
|
+
ow += kept;
|
|
566
|
+
/* Append "e<exp>" by hand. snprintf here showed up as BSD_vfprintf in profiling —
|
|
567
|
+
a full printf formatter per number is absurdly heavy for one integer. */
|
|
568
|
+
out[ow++] = 'e';
|
|
569
|
+
if (x < 0) { out[ow++] = '-'; x = -x; }
|
|
570
|
+
{
|
|
571
|
+
char ex[24];
|
|
572
|
+
int en = 0;
|
|
573
|
+
if (x == 0) ex[en++] = '0';
|
|
574
|
+
else while (x > 0) { ex[en++] = (char)('0' + (int)(x % 10)); x /= 10; }
|
|
575
|
+
while (en > 0) out[ow++] = ex[--en];
|
|
576
|
+
}
|
|
577
|
+
out[ow] = '\0';
|
|
578
|
+
return rb_float_new(rb_cstr_to_dbl(out, 0));
|
|
579
|
+
}
|
|
580
|
+
|
|
466
581
|
/* e10 is the final base-10 exponent (already adjusted by the fraction length). */
|
|
467
|
-
static VALUE fj_float_from_parts(uint64_t m10, int m10digits, int64_t e10, int neg, int overflow, const char *p, long n) {
|
|
468
|
-
/*
|
|
469
|
-
|
|
582
|
+
static FJ_ALWAYS_INLINE VALUE fj_float_from_parts(uint64_t m10, int m10digits, int64_t e10, int neg, int overflow, const char *p, long n) {
|
|
583
|
+
/* Fast path by mantissa width (our scanner accumulates m10 exactly up to 18
|
|
584
|
+
digits, flagging overflow beyond):
|
|
585
|
+
1..18 digits -> Eisel-Lemire, correctly-rounded for any exact uint64 mantissa
|
|
586
|
+
(Mushtak-Lemire). This pulls full-double-precision data (e.g.
|
|
587
|
+
citylots coordinates, 18 sig digits) off the slow strtod
|
|
588
|
+
fallback — the stdlib json gem still strtods it.
|
|
589
|
+
>18 digits / overflow / extreme exponent -> strtod (round-to-odd). */
|
|
590
|
+
if (!overflow && m10digits >= 1 && m10digits <= 18 && (long)m10digits + e10 >= -307) {
|
|
470
591
|
if (m10 == 0) return rb_float_new(neg ? -0.0 : 0.0);
|
|
471
|
-
return rb_float_new(
|
|
592
|
+
return rb_float_new(fj_eisel_lemire_s2d(e10, m10, neg));
|
|
472
593
|
}
|
|
473
|
-
/* Fallback for >
|
|
474
|
-
|
|
475
|
-
return rb_float_new(rb_str_to_dbl(fj_strip_underscores(p, n), 0));
|
|
594
|
+
/* Fallback for >18 digits / extreme or subnormal exponents. */
|
|
595
|
+
return fj_float_strtod(p, n);
|
|
476
596
|
}
|
|
477
597
|
|
|
478
598
|
/* Scan an already-bounded quoteless token [p, p+n) exactly once: validate it as a
|
|
@@ -553,8 +673,8 @@ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) {
|
|
|
553
673
|
e10 -= frac;
|
|
554
674
|
/* :bigdecimal always; :auto only when significant digits > 16. m10digits is >=
|
|
555
675
|
* the significant-digit count, so m10digits <= 16 skips the fj_sig_digits scan. */
|
|
556
|
-
if (st->
|
|
557
|
-
(st->
|
|
676
|
+
if (st->decimal_precision == 2 ||
|
|
677
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(p, n) > 16)) {
|
|
558
678
|
*out = fj_to_bigdecimal_token(p, n);
|
|
559
679
|
} else {
|
|
560
680
|
*out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, p, n);
|
|
@@ -578,7 +698,7 @@ static VALUE fj_parse_number(fj_state *st) {
|
|
|
578
698
|
long nlen;
|
|
579
699
|
int is_float = 0, neg = 0, overflow = 0;
|
|
580
700
|
uint64_t m10 = 0; /* mantissa: integer + fraction digits */
|
|
581
|
-
int m10digits = 0; /* mantissa digit chars (caps the
|
|
701
|
+
int m10digits = 0; /* mantissa digit chars (caps the Eisel-Lemire fast path at 18) */
|
|
582
702
|
int frac = 0; /* fraction digit chars: e10 -= frac */
|
|
583
703
|
int64_t e10 = 0;
|
|
584
704
|
|
|
@@ -665,8 +785,8 @@ static VALUE fj_parse_number(fj_state *st) {
|
|
|
665
785
|
* when significant digits > 16. Since m10digits >= significant digits, m10digits
|
|
666
786
|
* <= 16 guarantees not-BigDecimal and lets us skip the fj_sig_digits scan
|
|
667
787
|
* entirely (the common case — e.g. every coordinate in canada.json). */
|
|
668
|
-
if (st->
|
|
669
|
-
(st->
|
|
788
|
+
if (st->decimal_precision == 2 ||
|
|
789
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
|
|
670
790
|
return fj_to_bigdecimal_token(np, nlen);
|
|
671
791
|
}
|
|
672
792
|
return fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
|
|
@@ -833,7 +953,8 @@ static VALUE fj_classify_quoteless(fj_state *st, const char *p0, long n0) {
|
|
|
833
953
|
* before the whitespace check. */
|
|
834
954
|
enum { FJ_QL_ORD = 0, FJ_QL_TERM, FJ_QL_WS, FJ_QL_CMT };
|
|
835
955
|
static const unsigned char fj_ql_class[256] = {
|
|
836
|
-
[','] = FJ_QL_TERM, ['
|
|
956
|
+
[','] = FJ_QL_TERM, ['{'] = FJ_QL_TERM, ['}'] = FJ_QL_TERM,
|
|
957
|
+
['['] = FJ_QL_TERM, [']'] = FJ_QL_TERM,
|
|
837
958
|
[0x0A] = FJ_QL_TERM, [0x0D] = FJ_QL_TERM,
|
|
838
959
|
[0x09] = FJ_QL_WS, [0x0B] = FJ_QL_WS, [0x0C] = FJ_QL_WS, [' '] = FJ_QL_WS,
|
|
839
960
|
['#'] = FJ_QL_CMT, ['/'] = FJ_QL_CMT,
|
|
@@ -1060,7 +1181,8 @@ static int fj_try_member_number(fj_state *st, VALUE *out) {
|
|
|
1060
1181
|
/* Commit only if the number abuts a value terminator; otherwise (whitespace,
|
|
1061
1182
|
* letters, a second '.', "0x…", …) leave it to the quoteless scanner. */
|
|
1062
1183
|
t = (unsigned char)*p;
|
|
1063
|
-
if (!(t == ',' || t == '
|
|
1184
|
+
if (!(t == ',' || t == '{' || t == '}' || t == '[' || t == ']' ||
|
|
1185
|
+
t == 0x0A || t == 0x0D || p == buf + st->len)) {
|
|
1064
1186
|
return 0;
|
|
1065
1187
|
}
|
|
1066
1188
|
|
|
@@ -1071,8 +1193,8 @@ static int fj_try_member_number(fj_state *st, VALUE *out) {
|
|
|
1071
1193
|
return 1;
|
|
1072
1194
|
}
|
|
1073
1195
|
e10 -= frac;
|
|
1074
|
-
if (st->
|
|
1075
|
-
(st->
|
|
1196
|
+
if (st->decimal_precision == 2 ||
|
|
1197
|
+
(st->decimal_precision == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
|
|
1076
1198
|
*out = fj_to_bigdecimal_token(np, nlen);
|
|
1077
1199
|
} else {
|
|
1078
1200
|
*out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
|
|
@@ -1146,19 +1268,9 @@ static void fj_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) {
|
|
|
1146
1268
|
void rb_hash_bulk_insert(long, const VALUE *, VALUE);
|
|
1147
1269
|
#endif
|
|
1148
1270
|
|
|
1149
|
-
/* Hash entry count as a C long. RHASH_SIZE is not part of the public C API on
|
|
1150
|
-
* older Ruby (< ~2.7), but rb_hash_size (Hash#size's implementation) is available
|
|
1151
|
-
* everywhere. Only used on the rare :raise duplicate-key path, so the boxing cost
|
|
1152
|
-
* is irrelevant — and it keeps the extension buildable down to Ruby 2.5. */
|
|
1153
|
-
static inline long fj_hash_len(VALUE hash) {
|
|
1154
|
-
return NUM2LONG(rb_hash_size(hash));
|
|
1155
|
-
}
|
|
1156
|
-
|
|
1157
1271
|
/* Build a Hash from `count` interleaved key,value slots. Fast path (String keys,
|
|
1158
|
-
* default :last_wins
|
|
1159
|
-
*
|
|
1160
|
-
* actually happened. symbolize_keys / :first_wins use a per-member loop into the
|
|
1161
|
-
* same pre-sized hash. */
|
|
1272
|
+
* default :last_wins): pre-size + bulk insert. symbolize_keys / :first_wins use a
|
|
1273
|
+
* per-member loop into the same pre-sized hash. */
|
|
1162
1274
|
static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
|
|
1163
1275
|
long entries = count / 2, i;
|
|
1164
1276
|
VALUE hash = rb_hash_new_capa(entries);
|
|
@@ -1167,22 +1279,13 @@ static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
|
|
|
1167
1279
|
* the per-member loop below to report each dropped duplicate key. */
|
|
1168
1280
|
if (!st->symbolize_keys && !st->dup_first_wins && st->on_warning == Qnil) {
|
|
1169
1281
|
rb_hash_bulk_insert(count, pairs, hash);
|
|
1170
|
-
if (st->dup_raise && fj_hash_len(hash) < entries) {
|
|
1171
|
-
VALUE seen = rb_hash_new_capa(entries);
|
|
1172
|
-
for (i = 0; i + 1 < count; i += 2) {
|
|
1173
|
-
long before = fj_hash_len(seen);
|
|
1174
|
-
rb_hash_aset(seen, pairs[i], Qtrue);
|
|
1175
|
-
if (fj_hash_len(seen) == before) fj_error(st, "duplicate key");
|
|
1176
|
-
}
|
|
1177
|
-
}
|
|
1178
1282
|
return hash;
|
|
1179
1283
|
}
|
|
1180
1284
|
|
|
1181
1285
|
for (i = 0; i + 1 < count; i += 2) {
|
|
1182
1286
|
VALUE k = st->symbolize_keys ? rb_funcall(pairs[i], fj_to_sym_id, 0) : pairs[i];
|
|
1183
|
-
if (st->dup_first_wins || st->
|
|
1287
|
+
if (st->dup_first_wins || st->on_warning != Qnil) {
|
|
1184
1288
|
if (RTEST(rb_funcall(hash, fj_key_p_id, 1, k))) {
|
|
1185
|
-
if (st->dup_raise) fj_error(st, "duplicate key");
|
|
1186
1289
|
fj_warn(st, fj_sym_duplicate_key, "duplicate key");
|
|
1187
1290
|
if (st->dup_first_wins) continue;
|
|
1188
1291
|
}
|
|
@@ -1256,11 +1359,14 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1256
1359
|
int is_obj;
|
|
1257
1360
|
|
|
1258
1361
|
if (ps->fhead == 0) { /* top level: parse exactly one value */
|
|
1259
|
-
fj_skip_ws_comments(st);
|
|
1260
1362
|
b = fj_byte(st);
|
|
1363
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1261
1364
|
if (b == '{') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 1); vss = 0; continue; }
|
|
1262
1365
|
if (b == '[') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 0); vss = 0; continue; }
|
|
1263
1366
|
if (b == -1) fj_error(st, "unexpected end of input");
|
|
1367
|
+
/* Top-level scalar: must be a recognized JSON value (number / literal / quoted
|
|
1368
|
+
* string). A bare word raises — no top-level quoteless strings (B-broad). The
|
|
1369
|
+
* scalar-vs-separator boundary is enforced in fj_parse_c. */
|
|
1264
1370
|
result = fj_parse_value(st);
|
|
1265
1371
|
break;
|
|
1266
1372
|
}
|
|
@@ -1270,8 +1376,8 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1270
1376
|
|
|
1271
1377
|
if (is_obj) {
|
|
1272
1378
|
VALUE key;
|
|
1273
|
-
fj_skip_ws_comments(st);
|
|
1274
1379
|
b = fj_byte(st);
|
|
1380
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1275
1381
|
if (b == ',') { /* collapsing separator: skip empty member */
|
|
1276
1382
|
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
|
|
1277
1383
|
vss = 0;
|
|
@@ -1298,11 +1404,12 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1298
1404
|
}
|
|
1299
1405
|
if (b == ']') fj_error(st, "unexpected ']' — expected a key or '}'");
|
|
1300
1406
|
key = fj_parse_object_key(st);
|
|
1301
|
-
|
|
1302
|
-
if (
|
|
1407
|
+
b = fj_byte(st);
|
|
1408
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1409
|
+
if (b != ':') fj_error(st, "expected ':' after object key");
|
|
1303
1410
|
fj_advance(st, 1);
|
|
1304
|
-
fj_skip_ws_comments(st);
|
|
1305
1411
|
b = fj_byte(st);
|
|
1412
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1306
1413
|
if (b == '{' || b == '[') {
|
|
1307
1414
|
fj_vpush(ps, key);
|
|
1308
1415
|
fj_advance(st, 1);
|
|
@@ -1322,8 +1429,8 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1322
1429
|
fj_vpush(ps, fj_parse_member_value(st));
|
|
1323
1430
|
vss = 1;
|
|
1324
1431
|
} else { /* array */
|
|
1325
|
-
fj_skip_ws_comments(st);
|
|
1326
1432
|
b = fj_byte(st);
|
|
1433
|
+
if (FJ_UNLIKELY(fj_needs_ws_skip(b))) { fj_skip_ws_comments(st); b = fj_byte(st); }
|
|
1327
1434
|
if (b == ',') { /* collapsing separator: skip empty slot */
|
|
1328
1435
|
if (st->on_warning != Qnil && !vss) fj_warn(st, fj_sym_empty_slot, "extra comma, collapsed an empty slot");
|
|
1329
1436
|
vss = 0;
|
|
@@ -1349,6 +1456,15 @@ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
|
|
|
1349
1456
|
vss = 0;
|
|
1350
1457
|
continue;
|
|
1351
1458
|
}
|
|
1459
|
+
/* Strict hot path: inline the two commonest element types — a number and a
|
|
1460
|
+
plain double-quoted string — so they skip fj_parse_member_value's byte
|
|
1461
|
+
re-read + switch. Everything else (quoteless, single/triple-quote,
|
|
1462
|
+
smart-quote, literals) falls through to the full dispatch below. */
|
|
1463
|
+
if (b == '-' || b == '+' || b == '.' || (b >= '0' && b <= '9')) {
|
|
1464
|
+
VALUE num;
|
|
1465
|
+
if (fj_try_member_number(st, &num)) { fj_vpush(ps, num); vss = 1; continue; }
|
|
1466
|
+
}
|
|
1467
|
+
if (b == '"') { fj_vpush(ps, fj_parse_string(st, '"')); vss = 1; continue; }
|
|
1352
1468
|
fj_vpush(ps, fj_parse_member_value(st));
|
|
1353
1469
|
vss = 1;
|
|
1354
1470
|
}
|
|
@@ -1373,20 +1489,57 @@ static int fj_implicit_root_ahead(fj_state *st) {
|
|
|
1373
1489
|
return result;
|
|
1374
1490
|
}
|
|
1375
1491
|
|
|
1492
|
+
/* Between top-level documents, whitespace, comments, AND commas all separate
|
|
1493
|
+
* (commas collapse like the in-container lenient-comma rule). A space alone never
|
|
1494
|
+
* separates — that is handled inside the document by the quoteless run. Mirrors
|
|
1495
|
+
* the Ruby Parser#skip_document_separators. */
|
|
1496
|
+
static void fj_skip_document_separators(fj_state *st) {
|
|
1497
|
+
for (;;) {
|
|
1498
|
+
fj_skip_ws_comments(st);
|
|
1499
|
+
if (fj_byte(st) != ',') break;
|
|
1500
|
+
fj_advance(st, 1);
|
|
1501
|
+
}
|
|
1502
|
+
}
|
|
1503
|
+
|
|
1504
|
+
static int fj_is_hws(int b) { return b == ' ' || b == '\t' || b == 0x0B || b == 0x0C; }
|
|
1505
|
+
|
|
1506
|
+
/* After a top-level value: a self-delimiting value (object / array / string) may be
|
|
1507
|
+
* followed by anything, but a bare scalar (number / keyword) must be followed by a
|
|
1508
|
+
* real separator — a newline, ',', a comment, or EOF. A space is NOT a separator, so
|
|
1509
|
+
* `1 2 3` and `42 "x" true` raise. Mirrors the Ruby Parser#enforce_scalar_boundary. */
|
|
1510
|
+
static void fj_enforce_scalar_boundary(fj_state *st, VALUE value) {
|
|
1511
|
+
int b, nx;
|
|
1512
|
+
if (RB_TYPE_P(value, T_STRING) || RB_TYPE_P(value, T_HASH) || RB_TYPE_P(value, T_ARRAY)) return;
|
|
1513
|
+
for (;;) {
|
|
1514
|
+
b = fj_byte(st);
|
|
1515
|
+
if (b != -1 && fj_is_hws(b)) { fj_advance(st, 1); continue; }
|
|
1516
|
+
if (b != -1 && b >= 0x80) {
|
|
1517
|
+
long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
|
|
1518
|
+
if (m > 0) { st->pos += m; continue; } /* multibyte horizontal whitespace (NBSP, …) */
|
|
1519
|
+
}
|
|
1520
|
+
break;
|
|
1521
|
+
}
|
|
1522
|
+
b = fj_byte(st);
|
|
1523
|
+
if (b == -1 || b == 0x0A || b == 0x0D || b == ',') return;
|
|
1524
|
+
if (b == '#') return;
|
|
1525
|
+
if (b == '/') { nx = fj_byte_at(st, 1); if (nx == '/' || nx == '*') return; }
|
|
1526
|
+
fj_error(st, "a top-level number or keyword must be followed by a newline, ',', or end of input");
|
|
1527
|
+
}
|
|
1528
|
+
|
|
1376
1529
|
static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
|
|
1377
1530
|
fj_state st;
|
|
1378
|
-
VALUE
|
|
1531
|
+
VALUE enc_opt, dk;
|
|
1379
1532
|
|
|
1380
1533
|
Check_Type(input, T_STRING);
|
|
1381
1534
|
|
|
1382
|
-
enc_opt = rb_hash_aref(opts,
|
|
1535
|
+
enc_opt = rb_hash_aref(opts, fj_sym_encoding);
|
|
1383
1536
|
if (!NIL_P(enc_opt)) {
|
|
1384
|
-
input = rb_funcall(rb_str_dup(input),
|
|
1537
|
+
input = rb_funcall(rb_str_dup(input), fj_force_encoding_id, 1, enc_opt);
|
|
1385
1538
|
}
|
|
1386
|
-
if (!RTEST(rb_funcall(input,
|
|
1387
|
-
VALUE name = rb_funcall(rb_funcall(input,
|
|
1539
|
+
if (!RTEST(rb_funcall(input, fj_valid_encoding_p_id, 0))) {
|
|
1540
|
+
VALUE name = rb_funcall(rb_funcall(input, fj_encoding_id, 0), fj_name_id, 0);
|
|
1388
1541
|
VALUE msg = rb_sprintf("invalid byte sequence for %" PRIsVALUE, name);
|
|
1389
|
-
rb_exc_raise(rb_funcall(cEncodingError,
|
|
1542
|
+
rb_exc_raise(rb_funcall(cEncodingError, fj_new_id, 3, msg, Qnil, Qnil));
|
|
1390
1543
|
}
|
|
1391
1544
|
|
|
1392
1545
|
st.buf = RSTRING_PTR(input);
|
|
@@ -1402,55 +1555,55 @@ static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
|
|
|
1402
1555
|
st.kcache = NULL;
|
|
1403
1556
|
#endif
|
|
1404
1557
|
|
|
1405
|
-
st.symbolize_keys = RTEST(rb_hash_aref(opts,
|
|
1406
|
-
dk = rb_hash_aref(opts,
|
|
1407
|
-
st.dup_first_wins = (dk ==
|
|
1408
|
-
st.dup_raise = (dk == ID2SYM(rb_intern("raise")));
|
|
1558
|
+
st.symbolize_keys = RTEST(rb_hash_aref(opts, fj_sym_symbolize_keys));
|
|
1559
|
+
dk = rb_hash_aref(opts, fj_sym_duplicate_key);
|
|
1560
|
+
st.dup_first_wins = (dk == fj_sym_first_wins);
|
|
1409
1561
|
|
|
1410
1562
|
{
|
|
1411
|
-
VALUE bd = rb_hash_aref(opts,
|
|
1412
|
-
if (bd ==
|
|
1413
|
-
else if (bd ==
|
|
1414
|
-
else st.
|
|
1563
|
+
VALUE bd = rb_hash_aref(opts, fj_sym_decimal_precision);
|
|
1564
|
+
if (bd == fj_sym_float) st.decimal_precision = 0;
|
|
1565
|
+
else if (bd == fj_sym_bigdecimal) st.decimal_precision = 2;
|
|
1566
|
+
else st.decimal_precision = 1; /* :auto (default), including nil */
|
|
1415
1567
|
}
|
|
1416
1568
|
|
|
1417
|
-
st.on_warning = rb_hash_aref(opts,
|
|
1569
|
+
st.on_warning = rb_hash_aref(opts, fj_sym_on_warning); /* Qnil when absent */
|
|
1418
1570
|
|
|
1419
1571
|
if (st.len >= 3 && (unsigned char)st.buf[0] == 0xEF &&
|
|
1420
1572
|
(unsigned char)st.buf[1] == 0xBB && (unsigned char)st.buf[2] == 0xBF) {
|
|
1421
1573
|
st.pos = 3;
|
|
1422
1574
|
}
|
|
1423
1575
|
|
|
1424
|
-
/* With a block: yield each top-level
|
|
1425
|
-
* concatenated). Same loop as the Ruby each_value path
|
|
1576
|
+
/* With a block: yield each top-level document until EOF and return the document
|
|
1577
|
+
* count (NDJSON / JSONL / concatenated). Same loop as the Ruby each_value path. */
|
|
1426
1578
|
if (rb_block_given_p()) {
|
|
1579
|
+
long count = 0;
|
|
1427
1580
|
for (;;) {
|
|
1428
|
-
|
|
1581
|
+
VALUE v;
|
|
1582
|
+
fj_skip_document_separators(&st);
|
|
1429
1583
|
if (fj_eof(&st)) break;
|
|
1430
|
-
|
|
1584
|
+
v = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1585
|
+
fj_enforce_scalar_boundary(&st, v);
|
|
1586
|
+
rb_yield(v);
|
|
1587
|
+
count++;
|
|
1431
1588
|
}
|
|
1432
|
-
return
|
|
1589
|
+
return LONG2NUM(count);
|
|
1433
1590
|
}
|
|
1434
1591
|
|
|
1435
|
-
/* No block:
|
|
1436
|
-
*
|
|
1437
|
-
*
|
|
1438
|
-
*
|
|
1439
|
-
*
|
|
1440
|
-
* whitespace / newline / concatenation do), so a bracketless comma list still
|
|
1441
|
-
* raises in fj_parse_iter — the unsupported implicit-root array. */
|
|
1442
|
-
fj_skip_ws_comments(&st);
|
|
1443
|
-
if (fj_eof(&st)) return Qnil;
|
|
1444
|
-
value = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1445
|
-
fj_skip_ws_comments(&st);
|
|
1446
|
-
if (fj_eof(&st)) return value;
|
|
1592
|
+
/* No block: always return an Array of every top-level document (0 -> [], 1 ->
|
|
1593
|
+
* [doc], 2+ -> [d1, d2, …]) — the always-array contract. Documents are separated by
|
|
1594
|
+
* newline / comma / concatenation (self-delimiting values); a space alone never
|
|
1595
|
+
* separates, and a bare scalar must be followed by a real separator, so `1 2 3`
|
|
1596
|
+
* raises while `1\n2\n3` and `1, 2, 3` are three documents. */
|
|
1447
1597
|
{
|
|
1448
1598
|
VALUE arr = rb_ary_new();
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1599
|
+
for (;;) {
|
|
1600
|
+
VALUE v;
|
|
1601
|
+
fj_skip_document_separators(&st);
|
|
1602
|
+
if (fj_eof(&st)) break;
|
|
1603
|
+
v = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
|
|
1604
|
+
fj_enforce_scalar_boundary(&st, v);
|
|
1605
|
+
rb_ary_push(arr, v);
|
|
1606
|
+
}
|
|
1454
1607
|
return arr;
|
|
1455
1608
|
}
|
|
1456
1609
|
}
|
|
@@ -1465,8 +1618,19 @@ void Init_smarter_json(void) {
|
|
|
1465
1618
|
fj_key_p_id = rb_intern("key?");
|
|
1466
1619
|
fj_new_id = rb_intern("new");
|
|
1467
1620
|
fj_call_id = rb_intern("call");
|
|
1621
|
+
fj_force_encoding_id = rb_intern("force_encoding");
|
|
1622
|
+
fj_valid_encoding_p_id = rb_intern("valid_encoding?");
|
|
1623
|
+
fj_encoding_id = rb_intern("encoding");
|
|
1624
|
+
fj_name_id = rb_intern("name");
|
|
1468
1625
|
fj_sym_empty_slot = ID2SYM(rb_intern("empty_slot"));
|
|
1469
1626
|
fj_sym_empty_value = ID2SYM(rb_intern("empty_value"));
|
|
1470
1627
|
fj_sym_duplicate_key = ID2SYM(rb_intern("duplicate_key"));
|
|
1628
|
+
fj_sym_encoding = ID2SYM(rb_intern("encoding"));
|
|
1629
|
+
fj_sym_symbolize_keys = ID2SYM(rb_intern("symbolize_keys"));
|
|
1630
|
+
fj_sym_first_wins = ID2SYM(rb_intern("first_wins"));
|
|
1631
|
+
fj_sym_decimal_precision = ID2SYM(rb_intern("decimal_precision"));
|
|
1632
|
+
fj_sym_float = ID2SYM(rb_intern("float"));
|
|
1633
|
+
fj_sym_bigdecimal = ID2SYM(rb_intern("bigdecimal"));
|
|
1634
|
+
fj_sym_on_warning = ID2SYM(rb_intern("on_warning"));
|
|
1471
1635
|
rb_define_module_function(mSmarterJSON, "parse_c", fj_parse_c, 2);
|
|
1472
1636
|
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2021 The fast_float authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any
|
|
6
|
+
person obtaining a copy of this software and associated
|
|
7
|
+
documentation files (the "Software"), to deal in the
|
|
8
|
+
Software without restriction, including without
|
|
9
|
+
limitation the rights to use, copy, modify, merge,
|
|
10
|
+
publish, distribute, sublicense, and/or sell copies of
|
|
11
|
+
the Software, and to permit persons to whom the Software
|
|
12
|
+
is furnished to do so, subject to the following
|
|
13
|
+
conditions:
|
|
14
|
+
|
|
15
|
+
The above copyright notice and this permission notice
|
|
16
|
+
shall be included in all copies or substantial portions
|
|
17
|
+
of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
|
20
|
+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
|
21
|
+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
|
22
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
|
23
|
+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
24
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
25
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
|
26
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
27
|
+
DEALINGS IN THE SOFTWARE.
|