smarter_json 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1424 @@
1
+ #include "smarter_json.h"
2
+ #include <math.h>
3
+ #include <string.h>
4
+ #ifdef __ARM_NEON
5
+ #include <arm_neon.h>
6
+ #endif
7
+ #include "vendor/ryu.h" /* Ryū string->double, correctly rounded (Ulf Adams, Apache-2.0) */
8
+
9
+ /* Branch hints / prefetch on the hot scan loops. No-ops on compilers without the
10
+ * builtins (the code is correct either way; these only steer code layout). */
11
+ #if defined(__GNUC__) || defined(__clang__)
12
+ # define FJ_LIKELY(x) __builtin_expect(!!(x), 1) /* condition expected to be true */
13
+ # define FJ_UNLIKELY(x) __builtin_expect(!!(x), 0) /* condition expected to be false */
14
+ # define FJ_PREFETCH(p) __builtin_prefetch(p) /* ask for the cache line at p ahead of use */
15
+ #else
16
+ # define FJ_LIKELY(x) (x) /* fallback: plain condition, no hint */
17
+ # define FJ_UNLIKELY(x) (x) /* fallback: plain condition, no hint */
18
+ # define FJ_PREFETCH(p) ((void)0) /* fallback: no prefetch issued */
19
+ #endif
20
+
21
+ /*
22
+ * smarter_json C extension — self-contained parser (no callbacks into Ruby parse
23
+ * logic). One entry point, SmarterJSON.parse_c (made private on the Ruby side).
24
+ *
25
+ * Covers strict JSON, JSON5, and the HJSON-inspired layer (quoteless strings
26
+ * with recognized-literals-win classification, triple-quoted strings, implicit
27
+ * root object, newline-as-separator, broader unquoted keys). The smarter_json
28
+ * layer (smart quotes, Python literals, Unicode whitespace) is pure-Ruby only
29
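+ * for now; those acceleration:true parity specs stay red until ported here. NOTE(review): this paragraph looks partly stale — Unicode whitespace (fj_mbws) and the Python literals True/False/None (fj_classify_quoteless) are already handled in this file; confirm what is still pure-Ruby only.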
30
+ */
31
+
32
+ static VALUE mSmarterJSON; /* presumably the SmarterJSON module; assigned outside this excerpt — confirm */
33
+ static VALUE cParseError; /* exception class that fj_error instantiates with (message, line, col) and raises */
34
+ static VALUE cEncodingError; /* not referenced in this excerpt; presumably raised for encoding failures — confirm */
35
+ static ID fj_bigdecimal_id; /* cached BigDecimal() method id (set in Init) */
36
+ static ID fj_to_sym_id; /* cached :to_sym (symbolize_keys) */
37
+ static ID fj_key_p_id; /* cached :key? (non-default duplicate_key modes) */
38
+
39
+ /* Per-parse direct-mapped key cache: key bytes -> the interned (frozen,
40
+ * globally-rooted) String, so repeated keys skip the global fstring lookup.
41
+ * Only used when rb_enc_interned_str is available — the cached strings are then
42
+ * kept alive by the interned-string table, so the cache needs no GC marking. */
43
+ #define FJ_KCACHE_BITS 9 /* log2 of the slot count; also the fold shift applied to the hash in fj_key_str */
44
+ #define FJ_KCACHE_SIZE (1 << FJ_KCACHE_BITS) /* number of cache slots */
45
+ typedef struct { long len; VALUE str; } fj_kc_slot; /* len: byte length of the cached key; str: the cached String (fj_key_str treats Qfalse as "slot empty") */
46
+
47
+ typedef struct {
48
+ const char *buf; /* input bytes being parsed */
49
+ long len; /* number of bytes in buf */
50
+ long pos; /* byte cursor into buf; fj_advance clamps it to len */
51
+ rb_encoding *enc; /* encoding given to the Strings the parser creates */
52
+ int depth; /* not used in this excerpt; presumably the current nesting depth — confirm */
53
+ int symbolize_keys; /* option flag; presumably keys are converted via :to_sym when set — confirm */
54
+ int dup_first_wins; /* duplicate-key option flag; name suggests the first value is kept — confirm */
55
+ int dup_raise; /* duplicate-key option flag; name suggests duplicates raise — confirm */
56
+ int bigdecimal_load; /* 0 = float, 1 = auto, 2 = bigdecimal */
57
+ fj_kc_slot *kcache; /* per-parse key cache (NULL when interning unavailable) */
58
+ } fj_state;
59
+
60
+ /* Line/column at the current byte position, computed lazily (only when raising
61
+ * an error) by scanning from the start of the buffer. CR, LF, and CRLF each
62
+ * count as one newline; col is bytes since the last line start (1-based).
63
+ * Keeping this off the hot path is the point — fj_advance never touches it. */
64
+ static void fj_line_col(fj_state *st, long *line, long *col) {
65
+ long l = 1, c = 1, i;
66
+ long limit = (st->pos < st->len) ? st->pos : st->len;
67
+ for (i = 0; i < limit; i++) {
68
+ unsigned char b = (unsigned char)st->buf[i];
69
+ if (b == 0x0A) { l++; c = 1; }
70
+ else if (b == 0x0D) { l++; c = 1; if (i + 1 < st->len && (unsigned char)st->buf[i + 1] == 0x0A) i++; }
71
+ else c++;
72
+ }
73
+ *line = l;
74
+ *col = c;
75
+ }
76
+
77
+ /* 1-based column of the current byte position (bytes since the last line start).
78
+ * Used for triple-quoted indentation stripping (smarter_json.md §2.3). */
79
+ static long fj_column(fj_state *st) {
80
+ long c = 1, i = st->pos - 1;
81
+ while (i >= 0 && st->buf[i] != 0x0A && st->buf[i] != 0x0D) { c++; i--; }
82
+ return c;
83
+ }
84
+
85
+ /* Construct SmarterJSON::ParseError(message, line, col) and raise it. */
86
+ NORETURN(static void fj_error(fj_state *st, const char *msg));
87
+ static void fj_error(fj_state *st, const char *msg) {
88
+ long line, col;
89
+ VALUE exc;
90
+ fj_line_col(st, &line, &col);
91
+ exc = rb_funcall(cParseError, rb_intern("new"), 3,
92
+ rb_str_new_cstr(msg), LONG2NUM(line), LONG2NUM(col));
93
+ rb_exc_raise(exc);
94
+ }
95
+
96
+ static int fj_byte(fj_state *st) {
97
+ return (st->pos < st->len) ? (unsigned char)st->buf[st->pos] : -1;
98
+ }
99
+
100
+ static int fj_byte_at(fj_state *st, long off) {
101
+ long p = st->pos + off;
102
+ return (p >= 0 && p < st->len) ? (unsigned char)st->buf[p] : -1;
103
+ }
104
+
105
+ static int fj_eof(fj_state *st) { return st->pos >= st->len; }
106
+
107
+ /* Advance the byte cursor by n (clamped to EOF). No line/col bookkeeping — that
108
+ * is computed lazily in fj_line_col only when an error is raised. */
109
+ static void fj_advance(fj_state *st, long n) {
110
+ st->pos += n;
111
+ if (st->pos > st->len) st->pos = st->len;
112
+ }
113
+
114
/* ASCII whitespace test: space, tab, LF, VT, FF or CR. Returns 1 or 0. */
static int fj_is_ws(int b) {
    switch (b) {
    case 0x09: /* tab */
    case 0x0A: /* LF  */
    case 0x0B: /* VT  */
    case 0x0C: /* FF  */
    case 0x0D: /* CR  */
    case 0x20: /* space */
        return 1;
    default:
        return 0;
    }
}
116
+
117
/* If a multibyte Unicode whitespace character starts at p (with n bytes
 * available), return its encoded length (2 or 3); otherwise return 0. The
 * accepted set is meant to match Ruby's [[:space:]]; see smarter_json.md §4.7.
 *
 * Recognised byte sequences:
 *   C2 85, C2 A0
 *   E1 9A 80
 *   E2 80 80..8A, E2 80 A8, E2 80 A9, E2 80 AF, E2 81 9F
 *   E3 80 80
 * Any other lead byte is rejected immediately. */
static long fj_mbws(const char *p, long n) {
    const unsigned char *u = (const unsigned char *)p;

    if (n < 2) return 0;               /* every accepted sequence has >= 2 bytes */

    switch (u[0]) {
    case 0xC2:
        return (u[1] == 0x85 || u[1] == 0xA0) ? 2 : 0;

    case 0xE1:
        return (n >= 3 && u[1] == 0x9A && u[2] == 0x80) ? 3 : 0;

    case 0xE2:
        if (n < 3) return 0;
        if (u[1] == 0x80) {
            const unsigned char t = u[2];
            if ((t >= 0x80 && t <= 0x8A) || t == 0xA8 || t == 0xA9 || t == 0xAF) return 3;
            return 0;
        }
        return (u[1] == 0x81 && u[2] == 0x9F) ? 3 : 0;

    case 0xE3:
        return (n >= 3 && u[1] == 0x80 && u[2] == 0x80) ? 3 : 0;

    default:
        return 0;
    }
}
139
+
140
+ static void fj_skip_pure_ws(fj_state *st) {
141
+ for (;;) {
142
+ int b = fj_byte(st);
143
+ if (b == -1) break;
144
+ if (fj_is_ws(b)) {
145
+ fj_advance(st, 1);
146
+ } else if (b >= 0x80) {
147
+ long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
148
+ if (m == 0) break;
149
+ st->pos += m;
150
+ } else {
151
+ break;
152
+ }
153
+ }
154
+ }
155
+
156
+ /* A comment marker only starts a comment when preceded by whitespace or at the
157
+ * very start of input (the comment-marker rule). */
158
+ static int fj_preceded_by_ws_or_start(fj_state *st) {
159
+ long i, m;
160
+ unsigned char prev;
161
+ if (st->pos == 0) return 1;
162
+ prev = (unsigned char)st->buf[st->pos - 1];
163
+ if (fj_is_ws(prev)) return 1;
164
+ if (prev < 0x80) return 0;
165
+ i = st->pos - 1; /* back up to the lead byte of a multibyte char */
166
+ while (i > 0 && ((unsigned char)st->buf[i] & 0xC0) == 0x80) i--;
167
+ m = fj_mbws(st->buf + i, st->len - i);
168
+ return (m > 0 && i + m == st->pos);
169
+ }
170
+
171
+ static void fj_skip_to_eol(fj_state *st) {
172
+ int b;
173
+ while ((b = fj_byte(st)) != -1 && b != 0x0A && b != 0x0D) fj_advance(st, 1);
174
+ }
175
+
176
+ static void fj_skip_block_comment(fj_state *st) {
177
+ fj_advance(st, 2); /* consume the opening slash-star */
178
+ while (!fj_eof(st)) {
179
+ if (fj_byte(st) == '*' && fj_byte_at(st, 1) == '/') { fj_advance(st, 2); return; }
180
+ fj_advance(st, 1);
181
+ }
182
+ fj_error(st, "unterminated block comment");
183
+ }
184
+
185
+ static void fj_skip_ws_comments(fj_state *st) {
186
+ for (;;) {
187
+ int b, n;
188
+ fj_skip_pure_ws(st);
189
+ b = fj_byte(st);
190
+ if (b == -1) return;
191
+ n = fj_byte_at(st, 1);
192
+ int is_marker = (b == '#') || (b == '/' && (n == '/' || n == '*'));
193
+ if (!is_marker) return;
194
+ if (!fj_preceded_by_ws_or_start(st)) return;
195
+ if (b == '/' && n == '*') fj_skip_block_comment(st);
196
+ else fj_skip_to_eol(st);
197
+ }
198
+ }
199
+
200
+ /* forward declarations (mutual recursion) */
201
+ static VALUE fj_parse_value(fj_state *st);
202
+ static VALUE fj_parse_member_value(fj_state *st);
203
+
204
+ static void fj_append_utf8(VALUE buf, unsigned long cp) {
205
+ char tmp[4];
206
+ if (cp <= 0x7F) {
207
+ tmp[0] = (char)cp; rb_str_buf_cat(buf, tmp, 1);
208
+ } else if (cp <= 0x7FF) {
209
+ tmp[0] = (char)(0xC0 | (cp >> 6));
210
+ tmp[1] = (char)(0x80 | (cp & 0x3F));
211
+ rb_str_buf_cat(buf, tmp, 2);
212
+ } else if (cp <= 0xFFFF) {
213
+ tmp[0] = (char)(0xE0 | (cp >> 12));
214
+ tmp[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
215
+ tmp[2] = (char)(0x80 | (cp & 0x3F));
216
+ rb_str_buf_cat(buf, tmp, 3);
217
+ } else {
218
+ tmp[0] = (char)(0xF0 | (cp >> 18));
219
+ tmp[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
220
+ tmp[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
221
+ tmp[3] = (char)(0x80 | (cp & 0x3F));
222
+ rb_str_buf_cat(buf, tmp, 4);
223
+ }
224
+ }
225
+
226
/* Value (0..15) of an ASCII hexadecimal digit in either case, or -1 when b is
 * not a hex digit (including the -1 end-of-input marker). */
static int fj_hex_val(int b) {
    int folded;

    if (b >= '0' && b <= '9') return b - '0';

    folded = b | 0x20;   /* maps 'A'..'F' onto 'a'..'f'; nothing else lands in that range */
    if (folded >= 'a' && folded <= 'f') return folded - 'a' + 10;
    return -1;
}
232
+
233
+ static unsigned long fj_read_hex4(fj_state *st) {
234
+ unsigned long v = 0;
235
+ int i;
236
+ for (i = 0; i < 4; i++) {
237
+ int h = fj_hex_val(fj_byte(st));
238
+ if (h < 0) fj_error(st, "invalid \\u escape");
239
+ v = (v << 4) | (unsigned long)h;
240
+ fj_advance(st, 1);
241
+ }
242
+ return v;
243
+ }
244
+
245
/* Find the first `quote` byte or backslash in [p, end). Returns a pointer to
 * it, or `end` when neither occurs. On arm64 the bulk of the range is scanned
 * 16 bytes at a time with NEON; the remainder (and every other target) uses a
 * scalar loop. Because line/col are computed lazily, the caller can jump over
 * the whole run in one step. */
static const char *fj_scan_str(const char *p, const char *end, int quote) {
    const char wanted = (char)quote;
#ifdef __ARM_NEON
    const uint8x16_t quote_lanes = vdupq_n_u8((uint8_t)quote);
    const uint8x16_t slash_lanes = vdupq_n_u8('\\');

    for (; p + 16 <= end; p += 16) {
        const uint8x16_t block = vld1q_u8((const uint8_t *)p);
        const uint8x16_t hits = vorrq_u8(vceqq_u8(block, quote_lanes),
                                         vceqq_u8(block, slash_lanes));
        /* movemask emulation (Oj's technique): narrow to 4 bits per byte, so
         * the index of the first match is ctz / 4. */
        const uint8x8_t narrowed = vshrn_n_u16(vreinterpretq_u16_u8(hits), 4);
        uint64_t bits = vget_lane_u64(vreinterpret_u64_u8(narrowed), 0);

        if (FJ_UNLIKELY(bits != 0)) {   /* most chunks hold neither byte */
            bits &= 0x8888888888888888ull;
            return p + (__builtin_ctzll(bits) >> 2);
        }
        FJ_PREFETCH(p + 64);
    }
#endif
    while (p < end && *p != wanted && *p != '\\') p++;
    return (p < end) ? p : end;
}
271
+
272
+ static VALUE fj_parse_string(fj_state *st, int quote) {
273
+ long start;
274
+ VALUE buf;
275
+ int b;
276
+ const char *hit;
277
+ fj_advance(st, 1); /* opening quote */
278
+ start = st->pos;
279
+ /* Fast scan to the closing quote or the first backslash. */
280
+ hit = fj_scan_str(st->buf + st->pos, st->buf + st->len, quote);
281
+ fj_advance(st, hit - (st->buf + st->pos));
282
+ b = fj_byte(st);
283
+ if (FJ_LIKELY(b == quote)) { /* common case: a string with no escapes */
284
+ VALUE s = rb_enc_str_new(st->buf + start, st->pos - start, st->enc);
285
+ fj_advance(st, 1);
286
+ return s;
287
+ }
288
+ if (FJ_UNLIKELY(b == -1)) fj_error(st, "unterminated string");
289
+
290
+ buf = rb_str_buf_new(st->pos - start + 16);
291
+ rb_enc_associate(buf, rb_ascii8bit_encoding());
292
+ if (st->pos > start) rb_str_buf_cat(buf, st->buf + start, st->pos - start);
293
+
294
+ while ((b = fj_byte(st)) != -1) {
295
+ if (b == quote) {
296
+ fj_advance(st, 1);
297
+ rb_enc_associate(buf, st->enc);
298
+ return buf;
299
+ } else if (b == '\\') {
300
+ int e;
301
+ fj_advance(st, 1);
302
+ e = fj_byte(st);
303
+ if (e == -1) fj_error(st, "unterminated string escape");
304
+ switch (e) {
305
+ case '"': rb_str_buf_cat(buf, "\"", 1); fj_advance(st, 1); break;
306
+ case '\'': rb_str_buf_cat(buf, "'", 1); fj_advance(st, 1); break;
307
+ case '\\': rb_str_buf_cat(buf, "\\", 1); fj_advance(st, 1); break;
308
+ case '/': rb_str_buf_cat(buf, "/", 1); fj_advance(st, 1); break;
309
+ case 'b': rb_str_buf_cat(buf, "\b", 1); fj_advance(st, 1); break;
310
+ case 'f': rb_str_buf_cat(buf, "\f", 1); fj_advance(st, 1); break;
311
+ case 'n': rb_str_buf_cat(buf, "\n", 1); fj_advance(st, 1); break;
312
+ case 'r': rb_str_buf_cat(buf, "\r", 1); fj_advance(st, 1); break;
313
+ case 't': rb_str_buf_cat(buf, "\t", 1); fj_advance(st, 1); break;
314
+ case 0x0A: fj_advance(st, 1); break; /* \<LF>: line continuation */
315
+ case 0x0D: fj_advance(st, 1); if (fj_byte(st) == 0x0A) fj_advance(st, 1); break;
316
+ case 'u': {
317
+ unsigned long cp;
318
+ fj_advance(st, 1);
319
+ cp = fj_read_hex4(st);
320
+ if (cp >= 0xD800 && cp <= 0xDBFF) {
321
+ unsigned long lo;
322
+ if (fj_byte(st) != '\\' || fj_byte_at(st, 1) != 'u') {
323
+ fj_error(st, "unpaired high surrogate in string");
324
+ }
325
+ fj_advance(st, 2);
326
+ lo = fj_read_hex4(st);
327
+ if (lo < 0xDC00 || lo > 0xDFFF) fj_error(st, "invalid low surrogate value");
328
+ cp = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
329
+ }
330
+ fj_append_utf8(buf, cp);
331
+ break;
332
+ }
333
+ default:
334
+ fj_error(st, "invalid escape");
335
+ }
336
+ } else {
337
+ /* Literal run between escapes: NEON-scan to the next quote/backslash and
338
+ * bulk-copy the whole run in one rb_str_buf_cat, rather than byte by byte. */
339
+ const char *p0 = st->buf + st->pos;
340
+ const char *h = fj_scan_str(p0, st->buf + st->len, quote);
341
+ rb_str_buf_cat(buf, p0, h - p0);
342
+ fj_advance(st, h - p0);
343
+ }
344
+ }
345
+ fj_error(st, "unterminated string");
346
+ return Qnil; /* unreachable */
347
+ }
348
+
349
+ static void fj_consume_keyword(fj_state *st, const char *word) {
350
+ long n = (long)strlen(word), i;
351
+ for (i = 0; i < n; i++) {
352
+ if (fj_byte_at(st, i) != (unsigned char)word[i]) fj_error(st, "invalid literal");
353
+ }
354
+ fj_advance(st, n);
355
+ }
356
+
357
+ /* Copy a byte range into a fresh String, dropping underscores. */
358
+ static VALUE fj_strip_underscores(const char *p, long n) {
359
+ VALUE s = rb_str_buf_new(n);
360
+ long i;
361
+ for (i = 0; i < n; i++) if (p[i] != '_') rb_str_buf_cat(s, p + i, 1);
362
+ return s;
363
+ }
364
+
365
/* Count the significant mantissa digits of the numeric token [p, p+n):
 * counting starts at the first non-zero digit and includes every digit after
 * it (trailing zeros and fraction digits too) up to the exponent marker or the
 * end of the token. Signs, dots and underscores are ignored. This is Oj's
 * dec_cnt + 1. Returns 0 when the mantissa has no non-zero digit. */
static long fj_sig_digits(const char *p, long n) {
    long i = 0, total = 0;

    /* Phase 1: skip ahead to the first non-zero digit. */
    for (; i < n; i++) {
        if (p[i] == 'e' || p[i] == 'E') return 0;
        if (p[i] >= '1' && p[i] <= '9') break;
    }

    /* Phase 2: from there on every digit counts, until the exponent. */
    for (; i < n && p[i] != 'e' && p[i] != 'E'; i++) {
        if (p[i] >= '0' && p[i] <= '9') total++;
    }
    return total;
}
381
+
382
/* Decide whether a decimal token can be handed to BigDecimal() verbatim.
 * Returns 0 when the token needs cleaning first, i.e. when it contains
 *   - an underscore (smarter_json leniency),
 *   - a leading dot, optionally after a sign (".5"), or
 *   - a dot that is not directly followed by a digit ("5.", "5.e3");
 * returns 1 otherwise. */
static int fj_decimal_is_clean(const char *p, long n) {
    const char *const end = p + n;
    const char *body = p;
    const char *dot;

    if (memchr(p, '_', (size_t)n) != NULL) return 0;

    if (body < end && (*body == '-' || *body == '+')) body++;
    if (body < end && *body == '.') return 0;

    /* Visit each dot; the byte after it must exist and be a digit. */
    dot = memchr(body, '.', (size_t)(end - body));
    while (dot != NULL) {
        if (dot + 1 >= end) return 0;
        if (dot[1] < '0' || dot[1] > '9') return 0;
        dot = memchr(dot + 1, '.', (size_t)(end - (dot + 1)));
    }
    return 1;
}
398
+
399
+ /* Build a BigDecimal from a decimal token. Fast path (the common case): the raw
400
+ * token bytes go straight to BigDecimal() via the cached method id — the same
401
+ * shape Oj uses, no normalization, no extra allocation beyond the String. Only
402
+ * when the token has an underscore or a bare/trailing dot do we clean it, in a
403
+ * single C pass (no per-byte rb_str appends), then one rb_str_new. The grammar
404
+ * was already validated by the caller, so BigDecimal() can't raise on a clean
405
+ * token — no rescue frame (one fewer than Oj). */
406
+ static VALUE fj_to_bigdecimal_token(const char *p, long n) {
407
+ char stack[64];
408
+ char *buf;
409
+ long i = 0, w = 0;
410
+ VALUE s;
411
+
412
+ if (fj_decimal_is_clean(p, n)) {
413
+ return rb_funcall(rb_cObject, fj_bigdecimal_id, 1, rb_str_new(p, n));
414
+ }
415
+
416
+ buf = (n + 2 <= (long)sizeof(stack)) ? stack : ruby_xmalloc((size_t)(n + 2));
417
+ if (i < n && (p[i] == '-' || p[i] == '+')) buf[w++] = p[i++];
418
+ if (i < n && p[i] == '.') buf[w++] = '0'; /* ".5" -> "0.5" */
419
+ for (; i < n; i++) {
420
+ if (p[i] == '_') continue;
421
+ buf[w++] = p[i];
422
+ if (p[i] == '.') {
423
+ char nx = (i + 1 < n) ? p[i + 1] : '\0';
424
+ if (nx == 'e' || nx == 'E' || nx == '\0') buf[w++] = '0'; /* "5." -> "5.0" */
425
+ }
426
+ }
427
+ s = rb_str_new(buf, w);
428
+ if (buf != stack) ruby_xfree(buf);
429
+ return rb_funcall(rb_cObject, fj_bigdecimal_id, 1, s);
430
+ }
431
+
432
+ /* Shared conversion tail: turn the parts extracted during a scan into a Ruby
433
+ * value. Both fj_parse_number (strict-position scan) and fj_try_decimal
434
+ * (quoteless path) call these, so the Integer/Float a token produces is identical
435
+ * no matter which path scanned it. [p, n) is the raw token slice (with any sign),
436
+ * needed only by the bignum / strtod fallbacks. */
437
+ static VALUE fj_int_from_parts(uint64_t m, int digits, int neg, int overflow, const char *p, long n) {
438
+ if (!overflow && digits >= 1 && digits <= 18) {
439
+ int64_t v = (int64_t)m;
440
+ return LL2NUM(neg ? -v : v);
441
+ }
442
+ /* >18 digits (may exceed int64) -> bignum from the slice. */
443
+ if (memchr(p, '_', (size_t)n) == NULL) return rb_cstr_to_inum(p, 10, 0);
444
+ return rb_str_to_inum(fj_strip_underscores(p, n), 10, 0);
445
+ }
446
+
447
+ /* e10 is the final base-10 exponent (already adjusted by the fraction length). */
448
+ static VALUE fj_float_from_parts(uint64_t m10, int m10digits, int64_t e10, int neg, int overflow, const char *p, long n) {
449
+ /* Ryū fast path: <=17 mantissa digits and not in the subnormal range. */
450
+ if (!overflow && m10digits >= 1 && m10digits <= 17 && (long)m10digits + e10 >= -307) {
451
+ if (m10 == 0) return rb_float_new(neg ? -0.0 : 0.0);
452
+ return rb_float_new(ryu_s2d_from_parts(m10, m10digits, (int32_t)e10, neg != 0));
453
+ }
454
+ /* Fallback for >17 digits / extreme or subnormal exponents. */
455
+ if (memchr(p, '_', (size_t)n) == NULL) return rb_float_new(rb_cstr_to_dbl(p, 0));
456
+ return rb_float_new(rb_str_to_dbl(fj_strip_underscores(p, n), 0));
457
+ }
458
+
459
+ /* Scan an already-bounded quoteless token [p, p+n) exactly once: validate it as a
460
+ * JSON5 decimal *and* extract the mantissa/exponent in the same pass, then build
461
+ * the value through the shared fj_*_from_parts helpers. Returns 1 (and sets *out)
462
+ * for a valid number; returns 0 when the token is not a number, so the caller can
463
+ * keep it as a quoteless string. This replaces the old validate-then-convert
464
+ * sequence (fj_validate_decimal + fj_decimal_value/fj_int_value), which scanned
465
+ * the token three-plus times. The accept/reject grammar matches the old
466
+ * fj_validate_decimal exactly. (+Infinity/-Infinity and hex are handled by the
467
+ * caller before this point, so they never reach here.) The digit runs skip the
468
+ * per-byte '_' test, dropping to a slow step only when an underscore appears. */
469
+ static int fj_try_decimal(fj_state *st, const char *p, long n, VALUE *out) {
470
+ long i = 0;
471
+ int is_float = 0, neg = 0, has_digit = 0, overflow = 0;
472
+ uint64_t m10 = 0;
473
+ int m10digits = 0, frac = 0;
474
+ int64_t e10 = 0;
475
+
476
+ if (i < n && (p[i] == '-' || p[i] == '+')) { neg = (p[i] == '-'); i++; }
477
+
478
+ /* Integer part: a single '0', or [1-9] then digits/underscores. */
479
+ if (i < n && p[i] == '0') {
480
+ has_digit = 1; m10digits = 1; i++;
481
+ } else if (i < n && p[i] >= '1' && p[i] <= '9') {
482
+ has_digit = 1;
483
+ for (;;) {
484
+ while (i < n && p[i] >= '0' && p[i] <= '9') {
485
+ if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(p[i] - '0'); m10digits++; }
486
+ else overflow = 1;
487
+ i++;
488
+ }
489
+ if (i < n && p[i] == '_') { i++; continue; } /* slow step: underscores are rare */
490
+ break;
491
+ }
492
+ }
493
+
494
+ /* Fraction. */
495
+ if (i < n && p[i] == '.') {
496
+ is_float = 1; i++;
497
+ for (;;) {
498
+ while (i < n && p[i] >= '0' && p[i] <= '9') {
499
+ has_digit = 1;
500
+ if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(p[i] - '0'); m10digits++; frac++; }
501
+ else overflow = 1;
502
+ i++;
503
+ }
504
+ if (i < n && p[i] == '_') { i++; continue; }
505
+ break;
506
+ }
507
+ }
508
+
509
+ /* Exponent: [eE] [+-]? then digits/underscores (at least one required). */
510
+ if (i < n && (p[i] == 'e' || p[i] == 'E')) {
511
+ long es;
512
+ int eneg = 0;
513
+ is_float = 1; i++;
514
+ if (i < n && (p[i] == '-' || p[i] == '+')) { eneg = (p[i] == '-'); i++; }
515
+ es = i;
516
+ while (i < n && ((p[i] >= '0' && p[i] <= '9') || p[i] == '_')) {
517
+ if (p[i] != '_' && !overflow) {
518
+ e10 = e10 * 10 + (p[i] - '0');
519
+ if (e10 > 1000000) overflow = 1; /* extreme exponent -> strtod fallback on the slice */
520
+ }
521
+ i++;
522
+ }
523
+ if (i == es) return 0; /* 'e' with no exponent digits -> not a number */
524
+ if (eneg) e10 = -e10;
525
+ }
526
+
527
+ if (i != n) return 0; /* token not fully consumed -> not a number (string) */
528
+ if (!has_digit) return 0; /* e.g. "." or "+" -> not a number (string) */
529
+
530
+ if (!is_float) {
531
+ *out = fj_int_from_parts(m10, m10digits, neg, overflow, p, n);
532
+ return 1;
533
+ }
534
+ e10 -= frac;
535
+ /* :bigdecimal always; :auto only when significant digits > 16. m10digits is >=
536
+ * the significant-digit count, so m10digits <= 16 skips the fj_sig_digits scan. */
537
+ if (st->bigdecimal_load == 2 ||
538
+ (st->bigdecimal_load == 1 && m10digits > 16 && fj_sig_digits(p, n) > 16)) {
539
+ *out = fj_to_bigdecimal_token(p, n);
540
+ } else {
541
+ *out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, p, n);
542
+ }
543
+ return 1;
544
+ }
545
+
546
+ /* Top-level / strict-position number (JSON5 grammar). Single pass: the scan that
547
+ * finds the token boundary also accumulates the mantissa/exponent, so the common
548
+ * integer/float case never re-reads the token (no second extraction pass, and no
549
+ * separate fj_sig_digits pass). Scanning is a raw pointer loop that relies on the
550
+ * RSTRING_PTR NUL terminator as a sentinel — no per-byte bounds check — and the
551
+ * digit runs skip the per-byte '_' test (the leniency tax), dropping to a slow
552
+ * step only when an underscore actually appears. The extracted parts go through
553
+ * the same fj_*_from_parts helpers the quoteless path uses, so a token produces
554
+ * the identical Ruby value no matter which path scanned it. */
555
+ static VALUE fj_parse_number(fj_state *st) {
556
+ const char *buf = st->buf;
557
+ const char *p = buf + st->pos; /* buf[len] == '\0' (RSTRING_PTR) is the scan sentinel */
558
+ const char *np = p; /* token start, includes a leading sign */
559
+ long nlen;
560
+ int is_float = 0, neg = 0, overflow = 0;
561
+ uint64_t m10 = 0; /* mantissa: integer + fraction digits */
562
+ int m10digits = 0; /* mantissa digit chars (caps the Ryū fast path at 17) */
563
+ int frac = 0; /* fraction digit chars: e10 -= frac */
564
+ int64_t e10 = 0;
565
+
566
+ if (*p == '-' || *p == '+') { neg = (*p == '-'); p++; }
567
+
568
+ /* Cold branches (rare, not perf-critical): sync the cursor, reuse scalar helpers. */
569
+ if (*p == 'I') { st->pos = p - buf; fj_consume_keyword(st, "Infinity"); return rb_float_new(neg ? -INFINITY : INFINITY); }
570
+ if (*p == 'N') { st->pos = p - buf; fj_consume_keyword(st, "NaN"); return rb_float_new(NAN); }
571
+ if (*p == '0' && (p[1] == 'x' || p[1] == 'X')) {
572
+ const char *hs, *q;
573
+ VALUE hx;
574
+ p += 2;
575
+ hs = p;
576
+ while (fj_hex_val((unsigned char)*p) >= 0 || *p == '_') p++;
577
+ if (p == hs) { st->pos = p - buf; fj_error(st, "invalid hex number"); }
578
+ hx = rb_str_buf_new(16);
579
+ if (neg) rb_str_buf_cat(hx, "-", 1);
580
+ for (q = hs; q < p; q++) if (*q != '_') rb_str_buf_cat(hx, q, 1);
581
+ st->pos = p - buf;
582
+ return rb_str_to_inum(hx, 16, 0);
583
+ }
584
+
585
+ /* Integer part: a single '0', or [1-9] then digits/underscores. */
586
+ if (*p == '0') {
587
+ m10digits = 1; /* one leading zero, counted as a single mantissa digit */
588
+ p++;
589
+ } else if (*p >= '1' && *p <= '9') {
590
+ for (;;) {
591
+ while (*p >= '0' && *p <= '9') {
592
+ if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; }
593
+ else overflow = 1;
594
+ p++;
595
+ }
596
+ if (*p == '_') { p++; continue; } /* slow step: underscores are rare */
597
+ break;
598
+ }
599
+ } else if (*p == '.') {
600
+ /* leading decimal point: no integer part */
601
+ } else {
602
+ st->pos = p - buf;
603
+ fj_error(st, "invalid number");
604
+ }
605
+
606
+ /* Fraction. */
607
+ if (*p == '.') {
608
+ is_float = 1;
609
+ p++;
610
+ for (;;) {
611
+ while (*p >= '0' && *p <= '9') {
612
+ if (m10digits < 18) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; frac++; }
613
+ else overflow = 1;
614
+ p++;
615
+ }
616
+ if (*p == '_') { p++; continue; }
617
+ break;
618
+ }
619
+ }
620
+
621
+ /* Exponent. */
622
+ if (*p == 'e' || *p == 'E') {
623
+ int eneg = 0;
624
+ is_float = 1;
625
+ p++;
626
+ if (*p == '-' || *p == '+') { eneg = (*p == '-'); p++; }
627
+ if (!(*p >= '0' && *p <= '9')) { st->pos = p - buf; fj_error(st, "invalid number: expected digits in exponent"); }
628
+ while ((*p >= '0' && *p <= '9') || *p == '_') {
629
+ if (*p != '_' && !overflow) {
630
+ e10 = e10 * 10 + (*p - '0');
631
+ if (e10 > 1000000) overflow = 1; /* extreme exponent -> strtod fallback on the slice */
632
+ }
633
+ p++;
634
+ }
635
+ if (eneg) e10 = -e10;
636
+ }
637
+
638
+ st->pos = p - buf;
639
+ nlen = p - np;
640
+
641
+ if (!is_float) {
642
+ return fj_int_from_parts(m10, m10digits, neg, overflow, np, nlen);
643
+ }
644
+ e10 -= frac;
645
+ /* BigDecimal decision (same rule as fj_try_decimal): :bigdecimal always; :auto only
646
+ * when significant digits > 16. Since m10digits >= significant digits, m10digits
647
+ * <= 16 guarantees not-BigDecimal and lets us skip the fj_sig_digits scan
648
+ * entirely (the common case — e.g. every coordinate in canada.json). */
649
+ if (st->bigdecimal_load == 2 ||
650
+ (st->bigdecimal_load == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
651
+ return fj_to_bigdecimal_token(np, nlen);
652
+ }
653
+ return fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
654
+ }
655
+
656
+ static VALUE fj_parse_literal(fj_state *st, const char *word, VALUE value) {
657
+ fj_consume_keyword(st, word);
658
+ return value;
659
+ }
660
+
661
/* Can byte b begin an unquoted object key? True for ASCII letters, '_' and
 * '$'. Returns 1 or 0. */
static int fj_is_key_start(int b) {
    int folded;

    if (b == '_' || b == '$') return 1;

    folded = b | 0x20;   /* maps 'A'..'Z' onto 'a'..'z'; nothing else lands in that range */
    return folded >= 'a' && folded <= 'z';
}
664
+
665
/* Can byte b continue an unquoted object key? Everything that may start a key
 * (fj_is_key_start), plus ASCII digits and '-'. Returns 1 or 0. */
static int fj_is_key_continue(int b) {
    if (b == '-') return 1;
    if (b >= '0' && b <= '9') return 1;
    return fj_is_key_start(b);
}
668
+
669
+ /* Intern an object key (frozen, deduplicated) so repeated keys across records
670
+ * share one String and skip a per-occurrence allocation. On Ruby < 3.0 (no
671
+ * rb_enc_interned_str) this falls back to a plain string — Hash#[]= still dedups
672
+ * the key on store, just without saving the allocation. Keys only: values are
673
+ * rarely repeated, so interning them wouldn't pay off (this matches Oj). */
674
+ static inline VALUE fj_key_str(fj_state *st, const char *p, long n) {
675
+ #ifdef HAVE_RB_ENC_INTERNED_STR
676
+ if (st->kcache != NULL) {
677
+ uint64_t h = 1469598103934665603ULL; /* FNV-1a over the key bytes */
678
+ long i;
679
+ fj_kc_slot *slot;
680
+ for (i = 0; i < n; i++) { h ^= (unsigned char)p[i]; h *= 1099511628211ULL; }
681
+ slot = &st->kcache[(size_t)((h ^ (h >> FJ_KCACHE_BITS)) & (FJ_KCACHE_SIZE - 1))];
682
+ if (slot->str != Qfalse && slot->len == n &&
683
+ memcmp(RSTRING_PTR(slot->str), p, (size_t)n) == 0) {
684
+ return slot->str; /* hit — skip the global fstring lookup */
685
+ }
686
+ slot->str = rb_enc_interned_str(p, n, st->enc);
687
+ slot->len = n;
688
+ return slot->str;
689
+ }
690
+ return rb_enc_interned_str(p, n, st->enc);
691
+ #else
692
+ return rb_enc_str_new(p, n, st->enc);
693
+ #endif
694
+ }
695
+
696
+ static VALUE fj_parse_identifier_key(fj_state *st) {
697
+ long start = st->pos;
698
+ int b;
699
+ fj_advance(st, 1);
700
+ while ((b = fj_byte(st)) != -1 && fj_is_key_continue(b)) fj_advance(st, 1);
701
+ return fj_key_str(st, st->buf + start, st->pos - start);
702
+ }
703
+
704
+ static VALUE fj_parse_object_key(fj_state *st) {
705
+ int b = fj_byte(st);
706
+
707
+ /* Quoted key. The common case has no escapes: intern straight from the buffer
708
+ * with no throwaway allocation. An escaped key (rare) falls through to the
709
+ * full string parser; Hash#[]= still dedups it on store. */
710
+ if (b == '"' || b == '\'') {
711
+ long i = st->pos + 1;
712
+ while (i < st->len) {
713
+ char c = st->buf[i];
714
+ if (c == (char)b) {
715
+ long cstart = st->pos + 1;
716
+ VALUE k = fj_key_str(st, st->buf + cstart, i - cstart);
717
+ fj_advance(st, i - st->pos + 1); /* consume opening quote .. closing quote */
718
+ return k;
719
+ }
720
+ if (c == '\\') break;
721
+ i++;
722
+ }
723
+ return fj_parse_string(st, b);
724
+ }
725
+
726
+ if (fj_is_key_start(b)) return fj_parse_identifier_key(st);
727
+
728
+ fj_error(st, "expected a key");
729
+ return Qnil; /* unreachable */
730
+ }
731
+
732
+ /* --- quoteless classification (recognized-literals-win), pure C --- */
733
+
734
/* Does the token [p, p+n) spell exactly the NUL-terminated `word`? Returns 1
 * or 0. */
static int fj_tok_eq(const char *p, long n, const char *word) {
    if (n != (long)strlen(word)) return 0;
    return memcmp(p, word, (size_t)n) == 0;
}
738
+
739
/* Is the whole token [p, p+n) a hex integer literal? Shape: an optional sign,
 * "0x" or "0X", then hex digits with optional '_' separators. At least one
 * real hex digit is required — "0x" and "0x_" are not numbers, since the
 * caller drops the underscores and would otherwise convert an empty digit
 * string. Returns 1 or 0. */
static int fj_is_hex_token(const char *p, long n) {
    long i = 0;
    int saw_digit = 0;

    if (i < n && (p[i] == '-' || p[i] == '+')) i++;
    if (i + 1 >= n || p[i] != '0' || (p[i + 1] != 'x' && p[i + 1] != 'X')) return 0;

    for (i += 2; i < n; i++) {
        if (p[i] == '_') continue;                          /* separator, not a digit */
        if (fj_hex_val((unsigned char)p[i]) < 0) return 0;  /* anything else ends the match */
        saw_digit = 1;
    }
    return saw_digit;
}
748
+
749
+ static VALUE fj_classify_quoteless(fj_state *st, const char *p0, long n0) {
750
+ const char *p = p0;
751
+ long n = n0;
752
+ int c0;
753
+ /* trim leading/trailing whitespace (ASCII or multibyte Unicode) */
754
+ for (;;) {
755
+ if (n > 0 && fj_is_ws((unsigned char)p[0])) { p++; n--; continue; }
756
+ if (n > 0 && (unsigned char)p[0] >= 0x80) {
757
+ long m = fj_mbws(p, n);
758
+ if (m > 0) { p += m; n -= m; continue; }
759
+ }
760
+ break;
761
+ }
762
+ for (;;) {
763
+ if (n > 0 && fj_is_ws((unsigned char)p[n - 1])) { n--; continue; }
764
+ if (n > 0 && (unsigned char)p[n - 1] >= 0x80) {
765
+ long j = n - 1;
766
+ while (j > 0 && ((unsigned char)p[j] & 0xC0) == 0x80) j--;
767
+ long m = fj_mbws(p + j, n - j);
768
+ if (m > 0 && j + m == n) { n = j; continue; }
769
+ }
770
+ break;
771
+ }
772
+
773
+ /* Dispatch on the first byte: a digit or '.' can only be a number or a
774
+ * string (no named literal starts that way), so we skip the literal
775
+ * comparisons entirely. '+'/'-' can additionally be ±Infinity. Letters fall
776
+ * through to the literal checks. */
777
+ c0 = (n > 0) ? (unsigned char)p[0] : 0;
778
+
779
+ if ((c0 >= '0' && c0 <= '9') || c0 == '.' || c0 == '-' || c0 == '+') {
780
+ if (c0 == '+' && fj_tok_eq(p, n, "+Infinity")) return rb_float_new(INFINITY);
781
+ if (c0 == '-' && fj_tok_eq(p, n, "-Infinity")) return rb_float_new(-INFINITY);
782
+ if (fj_is_hex_token(p, n)) {
783
+ long i = 0;
784
+ int neg = 0;
785
+ VALUE hx;
786
+ if (p[i] == '-' || p[i] == '+') { neg = (p[i] == '-'); i++; }
787
+ i += 2; /* skip 0x */
788
+ hx = rb_str_buf_new(n);
789
+ if (neg) rb_str_buf_cat(hx, "-", 1);
790
+ for (; i < n; i++) if (p[i] != '_') rb_str_buf_cat(hx, p + i, 1);
791
+ return rb_str_to_inum(hx, 16, 0);
792
+ }
793
+ {
794
+ VALUE num;
795
+ if (fj_try_decimal(st, p, n, &num)) return num;
796
+ }
797
+ return rb_enc_str_new(p, n, st->enc);
798
+ }
799
+
800
+ if (fj_tok_eq(p, n, "true") || fj_tok_eq(p, n, "True")) return Qtrue;
801
+ if (fj_tok_eq(p, n, "false") || fj_tok_eq(p, n, "False")) return Qfalse;
802
+ if (fj_tok_eq(p, n, "null") || fj_tok_eq(p, n, "None") || fj_tok_eq(p, n, "undefined")) return Qnil;
803
+ if (fj_tok_eq(p, n, "NaN")) return rb_float_new(NAN);
804
+ if (fj_tok_eq(p, n, "Infinity")) return rb_float_new(INFINITY);
805
+
806
+ return rb_enc_str_new(p, n, st->enc);
807
+ }
808
+
809
+ /* Quoteless single-line string: scan to a delimiter (structural punctuation,
810
+ * newline, EOF, or a whitespace-preceded comment marker), then classify. */
811
/* Byte classes used by the quoteless-token boundary scan. The table is indexed
 * by the raw byte; every entry not listed is FJ_QL_ORD (0). Only ASCII bytes are
 * classified — bytes >= 0x80 are examined separately by the scanner because they
 * may start multibyte whitespace. LF and CR are terminators rather than
 * whitespace: they end the token. */
enum {
  FJ_QL_ORD = 0, /* ordinary token byte */
  FJ_QL_TERM,    /* ends the token */
  FJ_QL_WS,      /* ASCII whitespace inside the line */
  FJ_QL_CMT      /* '#' or '/': possible comment opener */
};
static const unsigned char fj_ql_class[256] = {
  [0x09] = FJ_QL_WS,
  [0x0A] = FJ_QL_TERM,
  [0x0B] = FJ_QL_WS,
  [0x0C] = FJ_QL_WS,
  [0x0D] = FJ_QL_TERM,
  [' ']  = FJ_QL_WS,
  ['#']  = FJ_QL_CMT,
  [',']  = FJ_QL_TERM,
  ['/']  = FJ_QL_CMT,
  [']']  = FJ_QL_TERM,
  ['}']  = FJ_QL_TERM,
};
822
+
823
+ static VALUE fj_parse_quoteless_or_literal(fj_state *st) {
824
+ long start = st->pos;
825
+ int prev_ws = 0, b, nx;
826
+ for (;;) {
827
+ b = fj_byte(st);
828
+ if (b == -1) break;
829
+ if (b >= 0x80) { /* possible multibyte whitespace */
830
+ long m = fj_mbws(st->buf + st->pos, st->len - st->pos);
831
+ if (m > 0) { prev_ws = 1; st->pos += m; }
832
+ else { prev_ws = 0; fj_advance(st, 1); }
833
+ continue;
834
+ }
835
+ /* One table lookup classifies the byte; the common ordinary byte takes the
836
+ * fast path with no further comparisons and no lookahead read. */
837
+ {
838
+ unsigned char cls = fj_ql_class[b];
839
+ if (FJ_LIKELY(cls == FJ_QL_ORD)) { prev_ws = 0; fj_advance(st, 1); continue; }
840
+ if (cls == FJ_QL_TERM) break;
841
+ if (cls == FJ_QL_WS) { prev_ws = 1; fj_advance(st, 1); continue; }
842
+ /* FJ_QL_CMT: '#' or '/' — a comment marker only when preceded by whitespace.
843
+ * The lookahead byte (nx) is read only here, not on every byte. */
844
+ if (prev_ws) {
845
+ if (b == '#') break;
846
+ nx = fj_byte_at(st, 1);
847
+ if (nx == '/' || nx == '*') break; /* b == '/' */
848
+ }
849
+ prev_ws = 0;
850
+ fj_advance(st, 1);
851
+ }
852
+ }
853
+ return fj_classify_quoteless(st, st->buf + start, st->pos - start);
854
+ }
855
+
856
+ /* --- triple-quoted strings (pure C, mirroring strip_triple) --- */
857
+
858
+ static VALUE fj_strip_indent(VALUE line, long indent, rb_encoding *enc) {
859
+ const char *p = RSTRING_PTR(line);
860
+ long m = RSTRING_LEN(line), i = 0;
861
+ while (i < indent && i < m && (p[i] == ' ' || p[i] == '\t')) i++;
862
+ return rb_enc_str_new(p + i, m - i, enc);
863
+ }
864
+
865
+ static int fj_blank_line(VALUE line) {
866
+ const char *p = RSTRING_PTR(line);
867
+ long m = RSTRING_LEN(line), i;
868
+ for (i = 0; i < m; i++) if (p[i] != ' ' && p[i] != '\t') return 0;
869
+ return 1;
870
+ }
871
+
872
+ static VALUE fj_strip_triple(const char *p, long n, long indent, rb_encoding *enc) {
873
+ VALUE lines = rb_ary_new();
874
+ VALUE out, res;
875
+ int leading_newline = (n > 0 && (p[0] == '\n' || p[0] == '\r'));
876
+ long i = 0, lstart = 0, len, idx;
877
+
878
+ while (i < n) {
879
+ if (p[i] == '\n' || p[i] == '\r') {
880
+ rb_ary_push(lines, rb_enc_str_new(p + lstart, i - lstart, enc));
881
+ if (p[i] == '\r' && i + 1 < n && p[i + 1] == '\n') i++;
882
+ i++;
883
+ lstart = i;
884
+ } else {
885
+ i++;
886
+ }
887
+ }
888
+ rb_ary_push(lines, rb_enc_str_new(p + lstart, n - lstart, enc));
889
+
890
+ out = rb_ary_new();
891
+ len = RARRAY_LEN(lines);
892
+ for (idx = 0; idx < len; idx++) {
893
+ VALUE line = rb_ary_entry(lines, idx);
894
+ if (idx == 0) {
895
+ if (leading_newline) continue;
896
+ rb_ary_push(out, line);
897
+ } else {
898
+ rb_ary_push(out, fj_strip_indent(line, indent, enc));
899
+ }
900
+ }
901
+ if (RARRAY_LEN(out) > 0 && fj_blank_line(rb_ary_entry(out, RARRAY_LEN(out) - 1))) {
902
+ rb_ary_pop(out);
903
+ }
904
+ res = rb_ary_join(out, rb_str_new_cstr("\n"));
905
+ rb_enc_associate(res, enc);
906
+ return res;
907
+ }
908
+
909
+ static VALUE fj_parse_triple_quoted(fj_state *st) {
910
+ long indent = fj_column(st) - 1;
911
+ long raw_start;
912
+ VALUE r;
913
+ fj_advance(st, 3);
914
+ raw_start = st->pos;
915
+ while (!fj_eof(st)) {
916
+ if (fj_byte(st) == '\'' && fj_byte_at(st, 1) == '\'' && fj_byte_at(st, 2) == '\'') break;
917
+ fj_advance(st, 1);
918
+ }
919
+ if (fj_eof(st)) fj_error(st, "unterminated triple-quoted string");
920
+ r = fj_strip_triple(st->buf + raw_start, st->pos - raw_start, indent, st->enc);
921
+ fj_advance(st, 3);
922
+ return r;
923
+ }
924
+
925
+ static VALUE fj_parse_single_or_triple(fj_state *st) {
926
+ if (fj_byte_at(st, 1) == '\'' && fj_byte_at(st, 2) == '\'') return fj_parse_triple_quoted(st);
927
+ return fj_parse_string(st, '\'');
928
+ }
929
+
930
+ /* Smart/curly quotes: U+201C/201D double (E2 80 9C/9D), U+2018/2019 single
931
+ * (E2 80 98/99). Returns 2 (double), 1 (single), or 0. */
932
+ static int fj_smart_quote_kind(fj_state *st) {
933
+ int b2;
934
+ if (fj_byte(st) != 0xE2 || fj_byte_at(st, 1) != 0x80) return 0;
935
+ b2 = fj_byte_at(st, 2);
936
+ if (b2 == 0x9C || b2 == 0x9D) return 2;
937
+ if (b2 == 0x98 || b2 == 0x99) return 1;
938
+ return 0;
939
+ }
940
+
941
+ /* Content between smart quotes is literal (no escape processing); lenient
942
+ * about open/close direction. */
943
+ static VALUE fj_parse_smart_string(fj_state *st, int kind) {
944
+ long start;
945
+ fj_advance(st, 3); /* opening smart quote */
946
+ start = st->pos;
947
+ while (!fj_eof(st)) {
948
+ if (fj_byte(st) == 0xE2 && fj_byte_at(st, 1) == 0x80) {
949
+ int b2 = fj_byte_at(st, 2);
950
+ int closer = (kind == 2) ? (b2 == 0x9C || b2 == 0x9D) : (b2 == 0x98 || b2 == 0x99);
951
+ if (closer) {
952
+ VALUE s = rb_enc_str_new(st->buf + start, st->pos - start, st->enc);
953
+ fj_advance(st, 3);
954
+ return s;
955
+ }
956
+ }
957
+ fj_advance(st, 1);
958
+ }
959
+ fj_error(st, "unterminated smart-quoted string");
960
+ return Qnil; /* unreachable */
961
+ }
962
+
963
+ /* --- containers --- */
964
+
965
+ /* Value in object-value or array-element position (scalar only — containers
966
+ * are handled by the iterative driver below). Quoteless allowed. Assumes the
967
+ * caller has already skipped whitespace/comments and checked for EOF. */
968
+ /* Fast path for a plain decimal number in object-value / array-element position.
969
+ * Scans a clean JSON5 decimal straight from the cursor in one pass and commits
970
+ * ONLY when the number immediately abuts a value terminator (',', '}', ']',
971
+ * newline, or EOF) — true for essentially all real JSON, where a number touches
972
+ * its delimiter. On any deviation (trailing whitespace, a letter, a second '.',
973
+ * '0x…', '±Infinity', …) it restores the cursor and returns 0, so the caller
974
+ * falls back to the full quoteless scanner, which preserves every lenient rule
975
+ * ("1 2 3" as a string, hex, Infinity). This bypasses the quoteless boundary scan
976
+ * + classify dispatch (and the per-number Infinity/hex probes) for the common
977
+ * case. Value construction goes through the same fj_*_from_parts helpers the
978
+ * other number paths use, so results can't drift. Returns 1 and sets *out, or 0
979
+ * with the cursor unchanged. */
980
+ static int fj_try_member_number(fj_state *st, VALUE *out) {
981
+ const char *buf = st->buf;
982
+ const char *p = buf + st->pos; /* RSTRING_PTR NUL terminator is the scan sentinel */
983
+ const char *np = p;
984
+ long nlen;
985
+ int is_float = 0, neg = 0, overflow = 0, t;
986
+ uint64_t m10 = 0;
987
+ int m10digits = 0, frac = 0;
988
+ int64_t e10 = 0;
989
+
990
+ if (*p == '-' || *p == '+') { neg = (*p == '-'); p++; }
991
+ /* Only a digit or '.' may open the numeric body; 'I'/'N'/etc. are left to the
992
+ * quoteless path (it handles ±Infinity and quoteless strings). */
993
+ if (!((*p >= '0' && *p <= '9') || *p == '.')) return 0;
994
+
995
+ /* Integer part: a single '0', or [1-9] then digits/underscores. */
996
+ if (*p == '0') {
997
+ m10digits = 1; p++;
998
+ } else if (*p >= '1' && *p <= '9') {
999
+ for (;;) {
1000
+ while (*p >= '0' && *p <= '9') {
1001
+ if (FJ_LIKELY(m10digits < 18)) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; }
1002
+ else overflow = 1;
1003
+ p++;
1004
+ }
1005
+ if (*p == '_') { p++; continue; }
1006
+ break;
1007
+ }
1008
+ }
1009
+
1010
+ /* Fraction. */
1011
+ if (*p == '.') {
1012
+ is_float = 1; p++;
1013
+ for (;;) {
1014
+ while (*p >= '0' && *p <= '9') {
1015
+ if (FJ_LIKELY(m10digits < 18)) { m10 = m10 * 10 + (uint64_t)(*p - '0'); m10digits++; frac++; }
1016
+ else overflow = 1;
1017
+ p++;
1018
+ }
1019
+ if (*p == '_') { p++; continue; }
1020
+ break;
1021
+ }
1022
+ }
1023
+
1024
+ /* Exponent. */
1025
+ if (*p == 'e' || *p == 'E') {
1026
+ const char *es;
1027
+ int eneg = 0;
1028
+ is_float = 1; p++;
1029
+ if (*p == '-' || *p == '+') { eneg = (*p == '-'); p++; }
1030
+ es = p;
1031
+ while ((*p >= '0' && *p <= '9') || *p == '_') {
1032
+ if (*p != '_' && !overflow) { e10 = e10 * 10 + (*p - '0'); if (e10 > 1000000) overflow = 1; }
1033
+ p++;
1034
+ }
1035
+ if (p == es) return 0; /* 'e' with no exponent digits -> let quoteless decide */
1036
+ if (eneg) e10 = -e10;
1037
+ }
1038
+
1039
+ if (m10digits == 0) return 0; /* e.g. "." or "+." -> not a number here */
1040
+
1041
+ /* Commit only if the number abuts a value terminator; otherwise (whitespace,
1042
+ * letters, a second '.', "0x…", …) leave it to the quoteless scanner. */
1043
+ t = (unsigned char)*p;
1044
+ if (!(t == ',' || t == '}' || t == ']' || t == 0x0A || t == 0x0D || p == buf + st->len)) {
1045
+ return 0;
1046
+ }
1047
+
1048
+ st->pos = p - buf;
1049
+ nlen = p - np;
1050
+ if (!is_float) {
1051
+ *out = fj_int_from_parts(m10, m10digits, neg, overflow, np, nlen);
1052
+ return 1;
1053
+ }
1054
+ e10 -= frac;
1055
+ if (st->bigdecimal_load == 2 ||
1056
+ (st->bigdecimal_load == 1 && m10digits > 16 && fj_sig_digits(np, nlen) > 16)) {
1057
+ *out = fj_to_bigdecimal_token(np, nlen);
1058
+ } else {
1059
+ *out = fj_float_from_parts(m10, m10digits, e10, neg, overflow, np, nlen);
1060
+ }
1061
+ return 1;
1062
+ }
1063
+
1064
+ static VALUE fj_parse_member_value(fj_state *st) {
1065
+ int b = fj_byte(st);
1066
+ switch (b) {
1067
+ case '"': return fj_parse_string(st, '"');
1068
+ case '\'': return fj_parse_single_or_triple(st);
1069
+ default: {
1070
+ int kind;
1071
+ if (b == '-' || b == '+' || b == '.' || (b >= '0' && b <= '9')) {
1072
+ VALUE num;
1073
+ if (fj_try_member_number(st, &num)) return num;
1074
+ }
1075
+ kind = fj_smart_quote_kind(st);
1076
+ if (kind) return fj_parse_smart_string(st, kind);
1077
+ return fj_parse_quoteless_or_literal(st);
1078
+ }
1079
+ }
1080
+ }
1081
+
1082
+ /* Top-level / strict scalar (no quoteless; containers handled by the driver). */
1083
+ static VALUE fj_parse_value(fj_state *st) {
1084
+ int b = fj_byte(st);
1085
+ switch (b) {
1086
+ case '"': return fj_parse_string(st, '"');
1087
+ case '\'': return fj_parse_single_or_triple(st);
1088
+ case 't': return fj_parse_literal(st, "true", Qtrue);
1089
+ case 'f': return fj_parse_literal(st, "false", Qfalse);
1090
+ case 'n': return fj_parse_literal(st, "null", Qnil);
1091
+ case 'T': return fj_parse_literal(st, "True", Qtrue);
1092
+ case 'F': return fj_parse_literal(st, "False", Qfalse);
1093
+ case 'u': return fj_parse_literal(st, "undefined", Qnil);
1094
+ case 'N': /* NaN (number) vs None (Python null) */
1095
+ if (fj_byte_at(st, 1) == 'a') return fj_parse_number(st);
1096
+ return fj_parse_literal(st, "None", Qnil);
1097
+ default:
1098
+ if (b == '-' || b == '+' || b == '.' || b == 'I' || (b >= '0' && b <= '9')) {
1099
+ return fj_parse_number(st);
1100
+ }
1101
+ {
1102
+ int kind = fj_smart_quote_kind(st);
1103
+ if (kind) return fj_parse_smart_string(st, kind);
1104
+ }
1105
+ fj_error(st, "unexpected character");
1106
+ }
1107
+ return Qnil; /* unreachable */
1108
+ }
1109
+
1110
+ /* --- container building: pre-sized hash + bulk insert (json/Oj style) --- */
1111
+
1112
+ #ifndef HAVE_RB_HASH_NEW_CAPA
1113
+ #define rb_hash_new_capa(n) rb_hash_new()
1114
+ #endif
1115
+
1116
+ #ifndef HAVE_RB_HASH_BULK_INSERT
1117
+ static void fj_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash) {
1118
+ long i;
1119
+ for (i = 0; i + 1 < count; i += 2) rb_hash_aset(hash, pairs[i], pairs[i + 1]);
1120
+ }
1121
+ #define rb_hash_bulk_insert fj_hash_bulk_insert
1122
+ #else
1123
+ /* Ruby 2.6 *exports* rb_hash_bulk_insert as a symbol (so have_func / HAVE_* is set
1124
+ * and the shim above is skipped) but does NOT declare it in any public header. Modern
1125
+ * clang treats the resulting implicit call as a hard error, so declare the prototype
1126
+ * ourselves. On 2.7+ the header already declares it identically, which is harmless. */
1127
+ void rb_hash_bulk_insert(long, const VALUE *, VALUE);
1128
+ #endif
1129
+
1130
+ /* Hash entry count as a C long. RHASH_SIZE is not part of the public C API on
1131
+ * older Ruby (< ~2.7), but rb_hash_size (Hash#size's implementation) is available
1132
+ * everywhere. Only used on the rare :raise duplicate-key path, so the boxing cost
1133
+ * is irrelevant — and it keeps the extension buildable down to Ruby 2.5. */
1134
+ static inline long fj_hash_len(VALUE hash) {
1135
+ return NUM2LONG(rb_hash_size(hash));
1136
+ }
1137
+
1138
+ /* Build a Hash from `count` interleaved key,value slots. Fast path (String keys,
1139
+ * default :last_wins or :raise): pre-size + bulk insert, detecting duplicates by
1140
+ * comparing the resulting size to the pair count — free unless a collision
1141
+ * actually happened. symbolize_keys / :first_wins use a per-member loop into the
1142
+ * same pre-sized hash. */
1143
+ static VALUE fj_build_object(fj_state *st, const VALUE *pairs, long count) {
1144
+ long entries = count / 2, i;
1145
+ VALUE hash = rb_hash_new_capa(entries);
1146
+
1147
+ if (!st->symbolize_keys && !st->dup_first_wins) {
1148
+ rb_hash_bulk_insert(count, pairs, hash);
1149
+ if (st->dup_raise && fj_hash_len(hash) < entries) {
1150
+ VALUE seen = rb_hash_new_capa(entries);
1151
+ for (i = 0; i + 1 < count; i += 2) {
1152
+ long before = fj_hash_len(seen);
1153
+ rb_hash_aset(seen, pairs[i], Qtrue);
1154
+ if (fj_hash_len(seen) == before) fj_error(st, "duplicate key");
1155
+ }
1156
+ }
1157
+ return hash;
1158
+ }
1159
+
1160
+ for (i = 0; i + 1 < count; i += 2) {
1161
+ VALUE k = st->symbolize_keys ? rb_funcall(pairs[i], fj_to_sym_id, 0) : pairs[i];
1162
+ if (st->dup_first_wins || st->dup_raise) {
1163
+ if (RTEST(rb_funcall(hash, fj_key_p_id, 1, k))) {
1164
+ if (st->dup_first_wins) continue;
1165
+ fj_error(st, "duplicate key");
1166
+ }
1167
+ }
1168
+ rb_hash_aset(hash, k, pairs[i + 1]);
1169
+ }
1170
+ return hash;
1171
+ }
1172
+
1173
+ /* --- working stacks: a GC-marked C value stack + a frame/mark stack ---
1174
+ * Pending values for not-yet-closed containers live on an explicit C array (not
1175
+ * a Ruby Array, so no Ruby-object op per value). Both buffers sit in one
1176
+ * TypedData object: GC marks the pending values via fj_pstack_mark, and frees
1177
+ * the buffers even if parsing raises mid-document. */
1178
+ typedef struct { long mark; int is_obj; } fj_frame;
1179
+
1180
+ typedef struct {
1181
+ VALUE *vptr; long vhead; long vcapa; /* pending values (GC-marked) */
1182
+ fj_frame *fptr; long fhead; long fcapa; /* open-container frames (no VALUEs) */
1183
+ } fj_pstack;
1184
+
1185
+ static void fj_pstack_mark(void *p) {
1186
+ fj_pstack *ps = (fj_pstack *)p;
1187
+ long i;
1188
+ for (i = 0; i < ps->vhead; i++) rb_gc_mark(ps->vptr[i]);
1189
+ }
1190
+ static void fj_pstack_free(void *p) {
1191
+ fj_pstack *ps = (fj_pstack *)p;
1192
+ if (ps->vptr != NULL) xfree(ps->vptr);
1193
+ if (ps->fptr != NULL) xfree(ps->fptr);
1194
+ xfree(ps);
1195
+ }
1196
+ static size_t fj_pstack_memsize(const void *p) {
1197
+ const fj_pstack *ps = (const fj_pstack *)p;
1198
+ return sizeof(fj_pstack) + (size_t)ps->vcapa * sizeof(VALUE) + (size_t)ps->fcapa * sizeof(fj_frame);
1199
+ }
1200
+ static const rb_data_type_t fj_pstack_type = {
1201
+ "smarter_json/pstack",
1202
+ { fj_pstack_mark, fj_pstack_free, fj_pstack_memsize, },
1203
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
1204
+ };
1205
+
1206
+ static inline void fj_vpush(fj_pstack *ps, VALUE v) {
1207
+ if (ps->vhead >= ps->vcapa) { ps->vcapa *= 2; REALLOC_N(ps->vptr, VALUE, ps->vcapa); }
1208
+ ps->vptr[ps->vhead++] = v;
1209
+ }
1210
+ static inline void fj_fpush(fj_pstack *ps, long mark, int is_obj) {
1211
+ if (ps->fhead >= ps->fcapa) { ps->fcapa *= 2; REALLOC_N(ps->fptr, fj_frame, ps->fcapa); }
1212
+ ps->fptr[ps->fhead].mark = mark;
1213
+ ps->fptr[ps->fhead].is_obj = is_obj;
1214
+ ps->fhead++;
1215
+ }
1216
+
1217
+ /* Iterative container parser — no C recursion. Each container's members/elements
1218
+ * are collected on the value stack and built at its closing brace with a
1219
+ * pre-sized hash + bulk insert (objects) or rb_ary_new_from_values (arrays). */
1220
+ static VALUE fj_parse_iter(fj_state *st, int implicit_root) {
1221
+ fj_pstack *ps;
1222
+ VALUE ps_obj = TypedData_Make_Struct(rb_cObject, fj_pstack, &fj_pstack_type, ps);
1223
+ VALUE result = Qnil;
1224
+
1225
+ ps->vptr = ALLOC_N(VALUE, 64); ps->vhead = 0; ps->vcapa = 64;
1226
+ ps->fptr = ALLOC_N(fj_frame, 16); ps->fhead = 0; ps->fcapa = 16;
1227
+
1228
+ if (implicit_root) fj_fpush(ps, 0, 1);
1229
+
1230
+ for (;;) {
1231
+ int b;
1232
+ long mark;
1233
+ int is_obj;
1234
+
1235
+ if (ps->fhead == 0) { /* top level: parse exactly one value */
1236
+ fj_skip_ws_comments(st);
1237
+ b = fj_byte(st);
1238
+ if (b == '{') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 1); continue; }
1239
+ if (b == '[') { fj_advance(st, 1); fj_fpush(ps, ps->vhead, 0); continue; }
1240
+ if (b == -1) fj_error(st, "unexpected end of input");
1241
+ result = fj_parse_value(st);
1242
+ break;
1243
+ }
1244
+
1245
+ mark = ps->fptr[ps->fhead - 1].mark;
1246
+ is_obj = ps->fptr[ps->fhead - 1].is_obj;
1247
+
1248
+ if (is_obj) {
1249
+ VALUE key;
1250
+ fj_skip_ws_comments(st);
1251
+ b = fj_byte(st);
1252
+ if (b == '}') {
1253
+ VALUE hash;
1254
+ fj_advance(st, 1);
1255
+ hash = fj_build_object(st, &ps->vptr[mark], ps->vhead - mark);
1256
+ ps->vhead = mark;
1257
+ ps->fhead--;
1258
+ if (ps->fhead == 0) { result = hash; break; }
1259
+ fj_vpush(ps, hash);
1260
+ fj_skip_ws_comments(st);
1261
+ if (fj_byte(st) == ',') fj_advance(st, 1);
1262
+ continue;
1263
+ }
1264
+ if (b == -1) {
1265
+ if (implicit_root && ps->fhead == 1) {
1266
+ result = fj_build_object(st, &ps->vptr[mark], ps->vhead - mark);
1267
+ break;
1268
+ }
1269
+ fj_error(st, "unterminated object");
1270
+ }
1271
+ if (b == ']') fj_error(st, "unexpected ']' — expected a key or '}'");
1272
+ key = fj_parse_object_key(st);
1273
+ fj_skip_ws_comments(st);
1274
+ if (fj_byte(st) != ':') fj_error(st, "expected ':' after object key");
1275
+ fj_advance(st, 1);
1276
+ fj_skip_ws_comments(st);
1277
+ b = fj_byte(st);
1278
+ if (b == '{' || b == '[') {
1279
+ fj_vpush(ps, key);
1280
+ fj_advance(st, 1);
1281
+ fj_fpush(ps, ps->vhead, (b == '{'));
1282
+ continue;
1283
+ }
1284
+ if (b == -1) fj_error(st, "unexpected end of input");
1285
+ fj_vpush(ps, key);
1286
+ fj_vpush(ps, fj_parse_member_value(st));
1287
+ fj_skip_ws_comments(st); /* skip_separator_run */
1288
+ if (fj_byte(st) == ',') fj_advance(st, 1);
1289
+ } else { /* array */
1290
+ fj_skip_ws_comments(st);
1291
+ b = fj_byte(st);
1292
+ if (b == ']') {
1293
+ VALUE ary;
1294
+ fj_advance(st, 1);
1295
+ ary = rb_ary_new_from_values(ps->vhead - mark, &ps->vptr[mark]);
1296
+ ps->vhead = mark;
1297
+ ps->fhead--;
1298
+ if (ps->fhead == 0) { result = ary; break; }
1299
+ fj_vpush(ps, ary);
1300
+ fj_skip_ws_comments(st);
1301
+ if (fj_byte(st) == ',') fj_advance(st, 1);
1302
+ continue;
1303
+ }
1304
+ if (b == -1) fj_error(st, "unterminated array");
1305
+ if (b == '}') fj_error(st, "unexpected '}' — expected ']' or a value");
1306
+ if (b == '{' || b == '[') {
1307
+ fj_advance(st, 1);
1308
+ fj_fpush(ps, ps->vhead, (b == '{'));
1309
+ continue;
1310
+ }
1311
+ fj_vpush(ps, fj_parse_member_value(st));
1312
+ fj_skip_ws_comments(st); /* skip_separator_run */
1313
+ if (fj_byte(st) == ',') fj_advance(st, 1);
1314
+ }
1315
+ }
1316
+
1317
+ RB_GC_GUARD(ps_obj);
1318
+ return result;
1319
+ }
1320
+
1321
+ /* At the start of a document: identifier followed by ':' means implicit root
1322
+ * object (no outer braces). Look ahead without consuming. */
1323
+ static int fj_implicit_root_ahead(fj_state *st) {
1324
+ int b = fj_byte(st), result;
1325
+ long sp;
1326
+ if (b == -1 || !fj_is_key_start(b)) return 0;
1327
+ sp = st->pos;
1328
+ fj_advance(st, 1);
1329
+ while ((b = fj_byte(st)) != -1 && fj_is_key_continue(b)) fj_advance(st, 1);
1330
+ fj_skip_pure_ws(st);
1331
+ result = (fj_byte(st) == ':');
1332
+ st->pos = sp;
1333
+ return result;
1334
+ }
1335
+
1336
+ static VALUE fj_parse_c(VALUE self, VALUE input, VALUE opts) {
1337
+ fj_state st;
1338
+ VALUE value, enc_opt, dk;
1339
+
1340
+ Check_Type(input, T_STRING);
1341
+
1342
+ enc_opt = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
1343
+ if (!NIL_P(enc_opt)) {
1344
+ input = rb_funcall(rb_str_dup(input), rb_intern("force_encoding"), 1, enc_opt);
1345
+ }
1346
+ if (!RTEST(rb_funcall(input, rb_intern("valid_encoding?"), 0))) {
1347
+ VALUE name = rb_funcall(rb_funcall(input, rb_intern("encoding"), 0), rb_intern("name"), 0);
1348
+ VALUE msg = rb_sprintf("invalid byte sequence for %" PRIsVALUE, name);
1349
+ rb_exc_raise(rb_funcall(cEncodingError, rb_intern("new"), 3, msg, Qnil, Qnil));
1350
+ }
1351
+
1352
+ st.buf = RSTRING_PTR(input);
1353
+ st.len = RSTRING_LEN(input);
1354
+ st.pos = 0;
1355
+ st.enc = rb_enc_get(input);
1356
+ st.depth = 0;
1357
+ #ifdef HAVE_RB_ENC_INTERNED_STR
1358
+ fj_kc_slot kcache[FJ_KCACHE_SIZE];
1359
+ memset(kcache, 0, sizeof(kcache));
1360
+ st.kcache = kcache;
1361
+ #else
1362
+ st.kcache = NULL;
1363
+ #endif
1364
+
1365
+ st.symbolize_keys = RTEST(rb_hash_aref(opts, ID2SYM(rb_intern("symbolize_keys"))));
1366
+ dk = rb_hash_aref(opts, ID2SYM(rb_intern("duplicate_key")));
1367
+ st.dup_first_wins = (dk == ID2SYM(rb_intern("first_wins")));
1368
+ st.dup_raise = (dk == ID2SYM(rb_intern("raise")));
1369
+
1370
+ {
1371
+ VALUE bd = rb_hash_aref(opts, ID2SYM(rb_intern("bigdecimal_load")));
1372
+ if (bd == ID2SYM(rb_intern("float"))) st.bigdecimal_load = 0;
1373
+ else if (bd == ID2SYM(rb_intern("bigdecimal"))) st.bigdecimal_load = 2;
1374
+ else st.bigdecimal_load = 1; /* :auto (default), including nil */
1375
+ }
1376
+
1377
+ if (st.len >= 3 && (unsigned char)st.buf[0] == 0xEF &&
1378
+ (unsigned char)st.buf[1] == 0xBB && (unsigned char)st.buf[2] == 0xBF) {
1379
+ st.pos = 3;
1380
+ }
1381
+
1382
+ /* With a block: yield each top-level value until EOF (JSONL / NDJSON /
1383
+ * concatenated). Same loop as the Ruby each_value path, on the C parser. */
1384
+ if (rb_block_given_p()) {
1385
+ for (;;) {
1386
+ fj_skip_ws_comments(&st);
1387
+ if (fj_eof(&st)) break;
1388
+ rb_yield(fj_parse_iter(&st, fj_implicit_root_ahead(&st)));
1389
+ }
1390
+ return Qnil;
1391
+ }
1392
+
1393
+ /* No block: auto-detect the document count for free — it is the same "is there
1394
+ * trailing content after the first value?" check that used to raise. 0 documents
1395
+ * -> nil; 1 document -> the value itself (single-document hot path, no Array
1396
+ * allocated); 2+ documents (NDJSON / JSONL / concatenated / whitespace-separated)
1397
+ * -> an Array of every top-level value. Commas do NOT separate documents (only
1398
+ * whitespace / newline / concatenation do), so a bracketless comma list still
1399
+ * raises in fj_parse_iter — the unsupported implicit-root array. */
1400
+ fj_skip_ws_comments(&st);
1401
+ if (fj_eof(&st)) return Qnil;
1402
+ value = fj_parse_iter(&st, fj_implicit_root_ahead(&st));
1403
+ fj_skip_ws_comments(&st);
1404
+ if (fj_eof(&st)) return value;
1405
+ {
1406
+ VALUE arr = rb_ary_new();
1407
+ rb_ary_push(arr, value);
1408
+ do {
1409
+ rb_ary_push(arr, fj_parse_iter(&st, fj_implicit_root_ahead(&st)));
1410
+ fj_skip_ws_comments(&st);
1411
+ } while (!fj_eof(&st));
1412
+ return arr;
1413
+ }
1414
+ }
1415
+
1416
+ void Init_smarter_json(void) {
1417
+ mSmarterJSON = rb_define_module("SmarterJSON");
1418
+ cParseError = rb_const_get(mSmarterJSON, rb_intern("ParseError"));
1419
+ cEncodingError = rb_const_get(mSmarterJSON, rb_intern("EncodingError"));
1420
+ fj_bigdecimal_id = rb_intern("BigDecimal");
1421
+ fj_to_sym_id = rb_intern("to_sym");
1422
+ fj_key_p_id = rb_intern("key?");
1423
+ rb_define_module_function(mSmarterJSON, "parse_c", fj_parse_c, 2);
1424
+ }