json 2.13.2 → 2.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,13 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
23
3
  #include "../simd/simd.h"
24
4
 
25
- #ifndef RB_UNLIKELY
26
- #define RB_UNLIKELY(expr) expr
27
- #endif
28
-
29
- #ifndef RB_LIKELY
30
- #define RB_LIKELY(expr) expr
31
- #endif
32
-
33
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
34
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
35
7
 
36
- static ID i_chr, i_aset, i_aref,
37
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
8
+ static ID i_new, i_try_convert, i_uminus, i_encode;
38
9
 
39
- static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
10
+ static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, sym_symbolize_names, sym_freeze,
40
11
  sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
41
12
 
42
13
  static int binary_encindex;
@@ -44,7 +15,7 @@ static int utf8_encindex;
44
15
 
45
16
  #ifndef HAVE_RB_HASH_BULK_INSERT
46
17
  // For TruffleRuby
47
- void
18
+ static void
48
19
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
49
20
  {
50
21
  long index = 0;
@@ -61,6 +32,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
61
32
  #define rb_hash_new_capa(n) rb_hash_new()
62
33
  #endif
63
34
 
35
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
36
+ static VALUE rb_str_to_interned_str(VALUE str) // Fallback definition, compiled only when HAVE_RB_STR_TO_INTERNED_STR is not defined.
37
+ {
38
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0); // Freezes str, then calls the method identified by i_uminus with no arguments (presumably String#-@ -- confirm where i_uminus is initialized).
39
+ }
40
+ #endif
64
41
 
65
42
  /* name cache */
66
43
 
@@ -106,116 +83,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
106
83
  cache->entries[index] = rstring;
107
84
  }
108
85
 
109
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
86
+ #define rstring_cache_memcmp memcmp // Default: plain libc memcmp; replaced below when the fast path is available.
87
+
88
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
89
+ #if __has_builtin(__builtin_bswap64)
90
+ #undef rstring_cache_memcmp
91
+ ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length) // Compares `length` bytes of str and rptr; returns -1, 0 or 1.
110
92
  {
111
- long rstring_length = RSTRING_LEN(rstring);
112
- if (length == rstring_length) {
113
- return memcmp(str, RSTRING_PTR(rstring), length);
114
- } else {
115
- return (int)(length - rstring_length);
93
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
94
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
95
+ // inline a simpler memcmp outperforms calling the libc version.
96
+ long i = 0;
97
+
98
+ for (; i + 8 <= length; i += 8) { // Compare eight bytes per iteration while at least eight remain.
99
+ uint64_t a, b;
100
+ memcpy(&a, str + i, 8); // memcpy is used for the load, so no alignment is required of str/rptr.
101
+ memcpy(&b, rptr + i, 8);
102
+ if (a != b) {
103
+ a = __builtin_bswap64(a); // After the byte swap the first byte in memory is the most significant (this code is only built for little-endian 64-bit targets),
104
+ b = __builtin_bswap64(b); // so the unsigned integer comparison below is decided by the first differing byte.
105
+ return (a < b) ? -1 : 1;
106
+ }
107
+ }
108
+
109
+ for (; i < length; i++) { // Remaining tail of fewer than eight bytes.
110
+ if (str[i] != rptr[i]) {
111
+ return (str[i] < rptr[i]) ? -1 : 1; // NOTE(review): compares plain `char`; where char is signed, bytes >= 0x80 order differently here than in the unsigned 8-byte path above -- confirm callers only need a self-consistent order.
112
+ }
116
113
  }
114
+
115
+ return 0; // All `length` bytes are equal.
117
116
  }
117
+ #endif
118
+ #endif
118
119
 
119
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
120
+ ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring) // Ordering used by the cache searches: compares (str, length) against rstring, first by length, then by bytes.
120
121
  {
121
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
122
- // Common names aren't likely to be very long. So we just don't
123
- // cache names above an arbitrary threshold.
124
- return Qfalse;
125
- }
122
+ const char *rstring_ptr;
123
+ long rstring_length;
126
124
 
127
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
128
- // Simple heuristic, if the first character isn't a letter,
129
- // we're much less likely to see this string again.
130
- // We mostly want to cache strings that are likely to be repeated.
131
- return Qfalse;
125
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length); // Fetches rstring's data pointer and length into the two locals.
126
+
127
+ if (length == rstring_length) {
128
+ return rstring_cache_memcmp(str, rstring_ptr, length); // Same length: the byte comparison decides; 0 means identical content.
129
+ } else {
130
+ return (int)(length - rstring_length); // Different lengths: the shorter one sorts first; contents are not examined.
132
131
  }
132
+ }
133
133
 
134
+ ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length) // Returns the cached string equal to (str, length), or builds a new interned string and caches it if there is room.
135
+ {
134
136
  int low = 0;
135
137
  int high = cache->length - 1;
136
- int mid = 0;
137
- int last_cmp = 0;
138
138
 
139
139
  while (low <= high) { // Binary search; relies on cache->entries being ordered by rstring_cache_cmp.
140
- mid = (high + low) >> 1;
140
+ int mid = (high + low) >> 1;
141
141
  VALUE entry = cache->entries[mid];
142
- last_cmp = rstring_cache_cmp(str, length, entry);
142
+ int cmp = rstring_cache_cmp(str, length, entry);
143
143
 
144
- if (last_cmp == 0) {
144
+ if (cmp == 0) {
145
145
  return entry; // Cache hit.
146
- } else if (last_cmp > 0) {
146
+ } else if (cmp > 0) {
147
147
  low = mid + 1;
148
148
  } else {
149
149
  high = mid - 1;
150
150
  }
151
151
  }
152
152
 
153
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
154
- // We assume the overwhelming majority of names don't need to be escaped.
155
- // But if they do, we have to fallback to the slow path.
156
- return Qfalse;
157
- }
158
-
159
153
  VALUE rstring = build_interned_string(str, length); // Cache miss. This version does not filter by length, first character or backslashes itself; presumably the caller pre-filters (see json_string_cacheable_p) -- confirm.
160
154
 
161
155
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
162
- if (last_cmp > 0) {
163
- mid += 1;
164
- }
165
-
166
- rvalue_cache_insert_at(cache, mid, rstring);
156
+ rvalue_cache_insert_at(cache, low, rstring); // When the search fails, `low` is the index of the first entry that compares greater, i.e. the insertion point.
167
157
  }
168
158
  return rstring; // Returned even when the cache is full and nothing was inserted.
169
159
  }
170
160
 
171
161
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length) // Symbol counterpart of rstring_cache_fetch: returns the cached symbol whose name equals (str, length), or builds and caches a new one.
172
162
  {
173
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
174
- // Common names aren't likely to be very long. So we just don't
175
- // cache names above an arbitrary threshold.
176
- return Qfalse;
177
- }
178
-
179
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
180
- // Simple heuristic, if the first character isn't a letter,
181
- // we're much less likely to see this string again.
182
- // We mostly want to cache strings that are likely to be repeated.
183
- return Qfalse;
184
- }
185
-
186
163
  int low = 0;
187
164
  int high = cache->length - 1;
188
- int mid = 0;
189
- int last_cmp = 0;
190
165
 
191
166
  while (low <= high) { // Binary search; relies on the entries being ordered by rstring_cache_cmp applied to their names.
192
- mid = (high + low) >> 1;
167
+ int mid = (high + low) >> 1;
193
168
  VALUE entry = cache->entries[mid];
194
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
169
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry)); // Entries are symbols, so each is converted to its string for the comparison.
195
170
 
196
- if (last_cmp == 0) {
171
+ if (cmp == 0) {
197
172
  return entry; // Cache hit.
198
- } else if (last_cmp > 0) {
173
+ } else if (cmp > 0) {
199
174
  low = mid + 1;
200
175
  } else {
201
176
  high = mid - 1;
202
177
  }
203
178
  }
204
179
 
205
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
206
- // We assume the overwhelming majority of names don't need to be escaped.
207
- // But if they do, we have to fallback to the slow path.
208
- return Qfalse;
209
- }
210
-
211
180
  VALUE rsymbol = build_symbol(str, length); // Cache miss. No length/first-character/backslash filtering happens here in this version; presumably the caller pre-filters -- confirm.
212
181
 
213
182
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
214
- if (last_cmp > 0) {
215
- mid += 1;
216
- }
217
-
218
- rvalue_cache_insert_at(cache, mid, rsymbol);
183
+ rvalue_cache_insert_at(cache, low, rsymbol); // When the search fails, `low` is the index of the first entry that compares greater, i.e. the insertion point.
219
184
  }
220
185
  return rsymbol; // Returned even when the cache is full and nothing was inserted.
221
186
  }
@@ -330,15 +295,6 @@ static void rvalue_stack_eagerly_release(VALUE handle)
330
295
  }
331
296
  }
332
297
 
333
-
334
- #ifndef HAVE_STRNLEN
335
- static size_t strnlen(const char *s, size_t maxlen)
336
- {
337
- char *p;
338
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
339
- }
340
- #endif
341
-
342
298
  static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
343
299
  {
344
300
  int len = 1;
@@ -379,7 +335,7 @@ typedef struct JSON_ParserStruct {
379
335
  int max_nesting;
380
336
  bool allow_nan;
381
337
  bool allow_trailing_comma;
382
- bool parsing_name;
338
+ bool allow_control_characters;
383
339
  bool symbolize_names;
384
340
  bool freeze;
385
341
  } JSON_ParserConfig;
@@ -395,6 +351,22 @@ typedef struct JSON_ParserStateStruct {
395
351
  int current_nesting;
396
352
  } JSON_ParserState;
397
353
 
354
+ static inline size_t rest(JSON_ParserState *state) { // Number of bytes between the cursor and the end pointer.
355
+ return state->end - state->cursor; // The pointer difference is converted to size_t; assumes cursor <= end -- TODO confirm callers never advance past end.
356
+ }
357
+
358
+ static inline bool eos(JSON_ParserState *state) { // True when the cursor is at or beyond the end pointer.
359
+ return state->cursor >= state->end;
360
+ }
361
+
362
+ static inline char peek(JSON_ParserState *state) // Returns the byte under the cursor without advancing, or 0 when eos(state) is true.
363
+ {
364
+ if (RB_UNLIKELY(eos(state))) {
365
+ return 0; // End of input is reported as 0, so through this helper alone it is indistinguishable from a literal NUL byte in the input.
366
+ }
367
+ return *state->cursor;
368
+ }
369
+
398
370
  static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
399
371
  {
400
372
  const char *cursor = state->cursor;
@@ -505,23 +477,24 @@ static const signed char digit_values[256] = {
505
477
  -1, -1, -1, -1, -1, -1, -1
506
478
  };
507
479
 
508
- static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
480
+ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe) // Decodes the four hex digits at sp into a 16-bit value; spe is the end bound used for the length check. Raises a parse error when digits are missing or invalid.
509
481
  {
510
- signed char b;
511
- uint32_t result = 0;
512
- b = digit_values[p[0]];
513
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
514
- result = (result << 4) | (unsigned char)b;
515
- b = digit_values[p[1]];
516
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
517
- result = (result << 4) | (unsigned char)b;
518
- b = digit_values[p[2]];
519
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
520
- result = (result << 4) | (unsigned char)b;
521
- b = digit_values[p[3]];
522
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
523
- result = (result << 4) | (unsigned char)b;
524
- return result;
482
+ if (RB_UNLIKELY(sp > spe - 4)) { // Fewer than four bytes remain before spe.
483
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2); // sp - 2 is reported as the error position (presumably the backslash that starts the escape -- confirm against callers).
484
+ }
485
+
486
+ const unsigned char *p = (const unsigned char *)sp; // Index the lookup table with unsigned bytes so values >= 0x80 cannot produce a negative index.
487
+
488
+ const signed char b0 = digit_values[p[0]];
489
+ const signed char b1 = digit_values[p[1]];
490
+ const signed char b2 = digit_values[p[2]];
491
+ const signed char b3 = digit_values[p[3]];
492
+
493
+ if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) { // A negative table entry has its sign bit set, so the OR is negative when any of the four lookups is negative (the table marks non-hex bytes with -1).
494
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
495
+ }
496
+
497
+ return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3; // Assemble the four nibbles, most significant first.
525
498
  }
526
499
 
527
500
  #define GET_PARSER_CONFIG \
@@ -530,61 +503,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
530
503
 
531
504
  static const rb_data_type_t JSON_ParserConfig_type;
532
505
 
533
- static const bool whitespace[256] = {
534
- [' '] = 1,
535
- ['\t'] = 1,
536
- ['\n'] = 1,
537
- ['\r'] = 1,
538
- ['/'] = 1,
539
- };
540
-
541
506
  static void
542
507
  json_eat_comments(JSON_ParserState *state) // Skips one comment starting at the cursor (a line comment or a block comment); raises a parse error for anything else. Assumes the cursor is on a '/' on entry -- confirm against callers.
543
508
  {
544
- if (state->cursor + 1 < state->end) {
545
- switch (state->cursor[1]) {
546
- case '/': {
547
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
548
- if (!state->cursor) {
549
- state->cursor = state->end;
550
- } else {
551
- state->cursor++;
552
- }
553
- break;
509
+ const char *start = state->cursor; // Remembered so errors can be reported at the start of the comment.
510
+ state->cursor++; // Step over the first character.
511
+
512
+ switch (peek(state)) { // peek() yields 0 at end of input, which falls into the default branch below.
513
+ case '/': {
514
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor); // Line comment: runs to the next newline.
515
+ if (!state->cursor) {
516
+ state->cursor = state->end; // No newline found: the comment extends to the end of input.
517
+ } else {
518
+ state->cursor++; // Step past the newline.
554
519
  }
555
- case '*': {
556
- state->cursor += 2;
557
- while (true) {
558
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
559
- if (!state->cursor) {
560
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
561
- } else {
562
- state->cursor++;
563
- if (state->cursor < state->end && *state->cursor == '/') {
564
- state->cursor++;
565
- break;
566
- }
567
- }
520
+ break;
521
+ }
522
+ case '*': {
523
+ state->cursor++; // Step over the opening star.
524
+
525
+ while (true) {
526
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor); // Kept in a local so state->cursor is never overwritten with NULL.
527
+ if (!next_match) {
528
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
529
+ }
530
+
531
+ state->cursor = next_match + 1;
532
+ if (peek(state) == '/') { // A star immediately followed by a slash closes the comment.
533
+ state->cursor++;
534
+ break;
568
535
  }
569
- break;
570
536
  }
571
- default:
572
- raise_parse_error("unexpected token %s", state);
573
- break;
537
+ break;
574
538
  }
575
- } else {
576
- raise_parse_error("unexpected token %s", state);
539
+ default:
540
+ raise_parse_error_at("unexpected token %s", state, start); // Neither comment form follows the first character: report at the saved start position.
541
+ break;
577
542
  }
578
543
  }
579
544
 
580
- static inline void
545
+ ALWAYS_INLINE(static) void
581
546
  json_eat_whitespace(JSON_ParserState *state) // Advances the cursor past spaces, tabs, CR, LF and comments; returns at the first other byte or at end of input.
582
547
  {
583
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
584
- if (RB_LIKELY(*state->cursor != '/')) {
585
- state->cursor++;
586
- } else {
587
- json_eat_comments(state);
548
+ while (true) {
549
+ switch (peek(state)) { // peek() yields 0 at end of input, which reaches the default branch and returns.
550
+ case ' ':
551
+ state->cursor++;
552
+ break;
553
+ case '\n':
554
+ state->cursor++;
555
+
556
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
557
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
558
+ while (rest(state) > 8) { // Only read a full 8-byte chunk while more than eight bytes remain.
559
+ uint64_t chunk;
560
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
561
+ if (chunk == 0x2020202020202020) { // Eight consecutive space bytes (0x20).
562
+ state->cursor += 8;
563
+ continue;
564
+ }
565
+
566
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT; // XOR zeroes every byte that is a space; on a little-endian load the low-order bytes come first in memory, so the trailing zero bits divided by CHAR_BIT give the count of leading spaces.
567
+ state->cursor += consecutive_spaces;
568
+ break; // Leaves only this inner while loop.
569
+ }
570
+ #endif
571
+ break;
572
+ case '\t':
573
+ case '\r':
574
+ state->cursor++;
575
+ break;
576
+ case '/':
577
+ json_eat_comments(state); // Skips the comment or raises a parse error.
578
+ break;
579
+
580
+ default:
581
+ return;
588
582
  }
589
583
  }
590
584
  }
@@ -615,11 +609,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
615
609
  return result;
616
610
  }
617
611
 
618
- static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
612
+ static inline bool json_string_cacheable_p(const char *string, size_t length) // Returns true when a string of `length` bytes at `string` passes the caching heuristics below.
613
+ {
614
+ // We mostly want to cache strings that are likely to be repeated.
615
+ // Simple heuristics:
616
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
617
+ // - If the first character isn't a letter, we're much less likely to see this string again.
618
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]); // string[0] is read even when length is 0; assumes that byte is always readable (presumably the closing quote) -- TODO confirm against callers.
619
+ }
620
+
621
+ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
619
622
  {
623
+ bool intern = is_name || config->freeze;
624
+ bool symbolize = is_name && config->symbolize_names;
620
625
  size_t bufferSize = stringEnd - string;
621
626
 
622
- if (is_name && state->in_array) {
627
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
623
628
  VALUE cached_key;
624
629
  if (RB_UNLIKELY(symbolize)) {
625
630
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -635,104 +640,125 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
635
640
  return build_string(string, stringEnd, intern, symbolize);
636
641
  }
637
642
 
638
- static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
639
- {
640
- size_t bufferSize = stringEnd - string;
641
- const char *p = string, *pe = string, *unescape, *bufferStart;
642
- char *buffer;
643
- int unescape_len;
644
- char buf[4];
643
+ #define JSON_MAX_UNESCAPE_POSITIONS 16 // Capacity of the fixed array of recorded backslash positions.
644
+ typedef struct _json_unescape_positions { // Backslash locations recorded while scanning a string, consumed later by json_next_backslash.
645
+ long size; // Number of entries in `positions` not yet consumed.
646
+ const char **positions; // Next unread recorded position; json_next_backslash advances this pointer as it consumes entries.
647
+ unsigned long additional_backslashes; // Number of further lookups json_next_backslash resolves with memchr once `positions` is empty (presumably backslashes seen after the array filled up -- confirm in the scanner).
648
+ } JSON_UnescapePositions;
645
649
 
646
- if (is_name && state->in_array) {
647
- VALUE cached_key;
648
- if (RB_UNLIKELY(symbolize)) {
649
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
650
- } else {
651
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
650
+ static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions) // Returns the next backslash at or after pe, taken from the recorded positions first and from memchr afterwards; NULL when none is left.
651
+ {
652
+ while (positions->size) {
653
+ positions->size--;
654
+ const char *next_position = positions->positions[0];
655
+ positions->positions++; // Consume the entry whether or not it is returned.
656
+ if (next_position >= pe) { // Recorded positions before pe are skipped (presumably backslashes already consumed as part of an earlier escape -- confirm).
657
+ return next_position;
652
658
  }
659
+ }
653
660
 
654
- if (RB_LIKELY(cached_key)) {
655
- return cached_key;
656
- }
661
+ if (positions->additional_backslashes) { // Recorded positions are exhausted: fall back to scanning the remaining bytes.
662
+ positions->additional_backslashes--;
663
+ return memchr(pe, '\\', stringEnd - pe); // May itself return NULL if no backslash remains in [pe, stringEnd).
657
664
  }
658
665
 
666
+ return NULL;
667
+ }
668
+
669
+ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
670
+ {
671
+ bool intern = is_name || config->freeze;
672
+ bool symbolize = is_name && config->symbolize_names;
673
+ size_t bufferSize = stringEnd - string;
674
+ const char *p = string, *pe = string, *bufferStart;
675
+ char *buffer;
676
+
659
677
  VALUE result = rb_str_buf_new(bufferSize);
660
678
  rb_enc_associate_index(result, utf8_encindex);
661
679
  buffer = RSTRING_PTR(result);
662
680
  bufferStart = buffer;
663
681
 
664
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
665
- unescape = (char *) "?";
666
- unescape_len = 1;
682
+ #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
683
+
684
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
667
685
  if (pe > p) {
668
686
  MEMCPY(buffer, p, char, pe - p);
669
687
  buffer += pe - p;
670
688
  }
671
689
  switch (*++pe) {
690
+ case '"':
691
+ case '/':
692
+ p = pe; // nothing to unescape just need to skip the backslash
693
+ break;
694
+ case '\\':
695
+ APPEND_CHAR('\\');
696
+ break;
672
697
  case 'n':
673
- unescape = (char *) "\n";
698
+ APPEND_CHAR('\n');
674
699
  break;
675
700
  case 'r':
676
- unescape = (char *) "\r";
701
+ APPEND_CHAR('\r');
677
702
  break;
678
703
  case 't':
679
- unescape = (char *) "\t";
680
- break;
681
- case '"':
682
- unescape = (char *) "\"";
683
- break;
684
- case '\\':
685
- unescape = (char *) "\\";
704
+ APPEND_CHAR('\t');
686
705
  break;
687
706
  case 'b':
688
- unescape = (char *) "\b";
707
+ APPEND_CHAR('\b');
689
708
  break;
690
709
  case 'f':
691
- unescape = (char *) "\f";
710
+ APPEND_CHAR('\f');
692
711
  break;
693
- case 'u':
694
- if (pe > stringEnd - 5) {
695
- raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
696
- } else {
697
- uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
698
- pe += 3;
699
- /* To handle values above U+FFFF, we take a sequence of
700
- * \uXXXX escapes in the U+D800..U+DBFF then
701
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
702
- * to make a 20-bit number, then add 0x10000 to get the
703
- * final codepoint.
704
- *
705
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
706
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
707
- * Area".
708
- */
709
- if ((ch & 0xFC00) == 0xD800) {
710
- pe++;
711
- if (pe > stringEnd - 6) {
712
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
713
- }
714
- if (pe[0] == '\\' && pe[1] == 'u') {
715
- uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
716
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
717
- | (sur & 0x3FF));
718
- pe += 5;
719
- } else {
720
- unescape = (char *) "?";
721
- break;
712
+ case 'u': {
713
+ uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
714
+ pe += 3;
715
+ /* To handle values above U+FFFF, we take a sequence of
716
+ * \uXXXX escapes in the U+D800..U+DBFF then
717
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
718
+ * to make a 20-bit number, then add 0x10000 to get the
719
+ * final codepoint.
720
+ *
721
+ * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
722
+ * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
723
+ * Area".
724
+ */
725
+ if ((ch & 0xFC00) == 0xD800) {
726
+ pe++;
727
+ if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
728
+ uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);
729
+
730
+ if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
731
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
722
732
  }
733
+
734
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
735
+ pe += 5;
736
+ } else {
737
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
738
+ break;
723
739
  }
724
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
725
- unescape = buf;
726
740
  }
741
+
742
+ int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
743
+ buffer += unescape_len;
744
+ p = ++pe;
727
745
  break;
746
+ }
728
747
  default:
729
- p = pe;
730
- continue;
748
+ if ((unsigned char)*pe < 0x20) {
749
+ if (!config->allow_control_characters) {
750
+ if (*pe == '\n') {
751
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
752
+ }
753
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
754
+ }
755
+ } else {
756
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
757
+ }
758
+ break;
731
759
  }
732
- MEMCPY(buffer, unescape, char, unescape_len);
733
- buffer += unescape_len;
734
- p = ++pe;
735
760
  }
761
+ #undef APPEND_CHAR
736
762
 
737
763
  if (stringEnd > p) {
738
764
  MEMCPY(buffer, p, char, stringEnd - p);
@@ -743,33 +769,13 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
743
769
  if (symbolize) {
744
770
  result = rb_str_intern(result);
745
771
  } else if (intern) {
746
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
772
+ result = rb_str_to_interned_str(result);
747
773
  }
748
774
 
749
775
  return result;
750
776
  }
751
777
 
752
778
  #define MAX_FAST_INTEGER_SIZE 18
753
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
754
- {
755
- bool negative = false;
756
- if (*p == '-') {
757
- negative = true;
758
- p++;
759
- }
760
-
761
- long long memo = 0;
762
- while (p < pe) {
763
- memo *= 10;
764
- memo += *p - '0';
765
- p++;
766
- }
767
-
768
- if (negative) {
769
- memo = -memo;
770
- }
771
- return LL2NUM(memo);
772
- }
773
779
 
774
780
  static VALUE json_decode_large_integer(const char *start, long len)
775
781
  {
@@ -783,17 +789,27 @@ static VALUE json_decode_large_integer(const char *start, long len)
783
789
  }
784
790
 
785
791
  static inline VALUE
786
- json_decode_integer(const char *start, const char *end)
792
+ json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end) // Builds a Ruby integer from the pre-parsed magnitude when it has few enough digits, otherwise from the source text [start, end).
787
793
  {
788
- long len = end - start;
789
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
790
- return fast_decode_integer(start, end);
794
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) { // Fewer than 18 decimal digits.
795
+ if (negative) {
796
+ return INT64T2NUM(-((int64_t)mantissa)); // Assumes mantissa really holds at most mantissa_digits decimal digits, i.e. is below 10^17, so the cast and negation cannot overflow int64_t -- confirm in the number scanner.
791
797
  }
792
- return json_decode_large_integer(start, len);
798
+ return UINT64T2NUM(mantissa);
799
+ }
800
+
801
+ return json_decode_large_integer(start, end - start); // Slow path: decode from the original text; mantissa is not used here.
793
802
  }
794
803
 
795
804
  static VALUE json_decode_large_float(const char *start, long len)
796
805
  {
806
+ if (RB_LIKELY(len < 64)) {
807
+ char buffer[64];
808
+ MEMCPY(buffer, start, char, len);
809
+ buffer[len] = '\0';
810
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
811
+ }
812
+
797
813
  VALUE buffer_v;
798
814
  char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
799
815
  MEMCPY(buffer, start, char, len);
@@ -803,21 +819,24 @@ static VALUE json_decode_large_float(const char *start, long len)
803
819
  return number;
804
820
  }
805
821
 
806
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
822
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
823
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
+ *
+ * Three paths, in order:
+ *  - config->decimal_class set: the source text [start, end) is passed as a
+ *    Ruby string to config->decimal_class via config->decimal_method_id.
+ *  - more than 17 mantissa digits, or mantissa_digits + exponent < -307:
+ *    the source text is handed to json_decode_large_float.
+ *  - otherwise: ryu_s2d_from_parts converts the pre-extracted parts.
824
+ */
825
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
826
+ const char *start, const char *end)
807
827
  {
808
- long len = end - start;
809
-
810
828
  if (RB_UNLIKELY(config->decimal_class)) {
811
- VALUE text = rb_str_new(start, len);
829
+ VALUE text = rb_str_new(start, end - start);
812
830
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
813
- } else if (RB_LIKELY(len < 64)) {
814
- char buffer[64];
815
- MEMCPY(buffer, start, char, len);
816
- buffer[len] = '\0';
817
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
818
- } else {
819
- return json_decode_large_float(start, len);
820
831
  }
832
+
833
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
834
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
835
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) { // NOTE(review): the comment above explains only the exponent test; the > 17 digits test presumably guards the precision or range of the pre-extracted mantissa -- confirm.
836
+ return json_decode_large_float(start, end - start);
837
+ }
838
+
839
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
821
840
  }
822
841
 
823
842
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -903,20 +922,6 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi
903
922
  return object;
904
923
  }
905
924
 
906
- static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
907
- {
908
- VALUE string;
909
- bool intern = is_name || config->freeze;
910
- bool symbolize = is_name && config->symbolize_names;
911
- if (escaped) {
912
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
913
- } else {
914
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
915
- }
916
-
917
- return string;
918
- }
919
-
920
925
  static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
921
926
  {
922
927
  if (RB_UNLIKELY(config->on_load_proc)) {
@@ -939,17 +944,11 @@ static const bool string_scan_table[256] = {
939
944
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
940
945
  };
941
946
 
942
- #if (defined(__GNUC__ ) || defined(__clang__))
943
- #define FORCE_INLINE __attribute__((always_inline))
944
- #else
945
- #define FORCE_INLINE
946
- #endif
947
-
948
947
  #ifdef HAVE_SIMD
949
948
  static SIMD_Implementation simd_impl = SIMD_NONE;
950
949
  #endif /* HAVE_SIMD */
951
950
 
952
- static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
951
+ ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
953
952
  {
954
953
  #ifdef HAVE_SIMD
955
954
  #if defined(HAVE_SIMD_NEON)
@@ -957,7 +956,7 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
957
956
  uint64_t mask = 0;
958
957
  if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
959
958
  state->cursor += trailing_zeros64(mask) >> 2;
960
- return 1;
959
+ return true;
961
960
  }
962
961
 
963
962
  #elif defined(HAVE_SIMD_SSE2)
@@ -965,64 +964,232 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
965
964
  int mask = 0;
966
965
  if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
967
966
  state->cursor += trailing_zeros(mask);
968
- return 1;
967
+ return true;
969
968
  }
970
969
  }
971
970
  #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
972
971
  #endif /* HAVE_SIMD */
973
972
 
974
- while (state->cursor < state->end) {
973
+ while (!eos(state)) {
975
974
  if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
976
- return 1;
975
+ return true;
977
976
  }
978
- *state->cursor++;
977
+ state->cursor++;
979
978
  }
980
- return 0;
979
+ return false;
981
980
  }
982
981
 
983
- static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
982
+ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
984
983
  {
985
- state->cursor++;
986
- const char *start = state->cursor;
987
- bool escaped = false;
984
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
985
+ JSON_UnescapePositions positions = {
986
+ .size = 0,
987
+ .positions = backslashes,
988
+ .additional_backslashes = 0,
989
+ };
988
990
 
989
- while (RB_UNLIKELY(string_scan(state))) {
991
+ do {
990
992
  switch (*state->cursor) {
991
993
  case '"': {
992
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
994
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
993
995
  state->cursor++;
994
996
  return json_push_value(state, config, string);
995
997
  }
996
998
  case '\\': {
997
- state->cursor++;
998
- escaped = true;
999
- if ((unsigned char)*state->cursor < 0x20) {
1000
- raise_parse_error("invalid ASCII control character in string: %s", state);
999
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
1000
+ backslashes[positions.size] = state->cursor;
1001
+ positions.size++;
1002
+ } else {
1003
+ positions.additional_backslashes++;
1001
1004
  }
1005
+ state->cursor++;
1002
1006
  break;
1003
1007
  }
1004
1008
  default:
1005
- raise_parse_error("invalid ASCII control character in string: %s", state);
1009
+ if (!config->allow_control_characters) {
1010
+ raise_parse_error("invalid ASCII control character in string: %s", state);
1011
+ }
1006
1012
  break;
1007
1013
  }
1008
1014
 
1009
1015
  state->cursor++;
1010
- }
1016
+ } while (string_scan(state));
1011
1017
 
1012
1018
  raise_parse_error("unexpected end of input, expected closing \"", state);
1013
1019
  return Qfalse;
1014
1020
  }
1015
1021
 
1022
+ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1023
+ {
1024
+ state->cursor++;
1025
+ const char *start = state->cursor;
1026
+
1027
+ if (RB_UNLIKELY(!string_scan(state))) {
1028
+ raise_parse_error("unexpected end of input, expected closing \"", state);
1029
+ }
1030
+
1031
+ if (RB_LIKELY(*state->cursor == '"')) {
1032
+ VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1033
+ state->cursor++;
1034
+ return json_push_value(state, config, string);
1035
+ }
1036
+ return json_parse_escaped_string(state, config, is_name, start);
1037
+ }
1038
+
1039
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1040
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1041
+ // Additional References:
1042
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1043
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1044
+ static inline uint64_t decode_8digits_unrolled(uint64_t val) {
1045
+ const uint64_t mask = 0x000000FF000000FF;
1046
+ const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
1047
+ const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
1048
+ val -= 0x3030303030303030;
1049
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1050
+ val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
1051
+ return val;
1052
+ }
1053
+
1054
+ static inline uint64_t decode_4digits_unrolled(uint32_t val) {
1055
+ const uint32_t mask = 0x000000FF;
1056
+ const uint32_t mul1 = 100;
1057
+ val -= 0x30303030;
1058
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1059
+ val = ((val & mask) * mul1) + (((val >> 16) & mask));
1060
+ return val;
1061
+ }
1062
+ #endif
1063
+
1064
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1065
+ {
1066
+ const char *start = state->cursor;
1067
+
1068
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1069
+ while (rest(state) >= sizeof(uint64_t)) {
1070
+ uint64_t next_8bytes;
1071
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1072
+
1073
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1074
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1075
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1076
+
1077
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1078
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1079
+ state->cursor += 8;
1080
+ continue;
1081
+ }
1082
+
1083
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1084
+
1085
+ if (consecutive_digits >= 4) {
1086
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1087
+ state->cursor += 4;
1088
+ consecutive_digits -= 4;
1089
+ }
1090
+
1091
+ while (consecutive_digits) {
1092
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1093
+ consecutive_digits--;
1094
+ state->cursor++;
1095
+ }
1096
+
1097
+ return (int)(state->cursor - start);
1098
+ }
1099
+ #endif
1100
+
1101
+ char next_char;
1102
+ while (rb_isdigit(next_char = peek(state))) {
1103
+ *accumulator = *accumulator * 10 + (next_char - '0');
1104
+ state->cursor++;
1105
+ }
1106
+ return (int)(state->cursor - start);
1107
+ }
1108
+
1109
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1110
+ {
1111
+ bool integer = true;
1112
+ const char first_digit = *state->cursor;
1113
+
1114
+ // Variables for Ryu optimization - extract digits during parsing
1115
+ int32_t exponent = 0;
1116
+ int decimal_point_pos = -1;
1117
+ uint64_t mantissa = 0;
1118
+
1119
+ // Parse integer part and extract mantissa digits
1120
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1121
+
1122
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1123
+ raise_parse_error_at("invalid number: %s", state, start);
1124
+ }
1125
+
1126
+ // Parse fractional part
1127
+ if (peek(state) == '.') {
1128
+ integer = false;
1129
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1130
+ state->cursor++;
1131
+
1132
+ int fractional_digits = json_parse_digits(state, &mantissa);
1133
+ mantissa_digits += fractional_digits;
1134
+
1135
+ if (RB_UNLIKELY(!fractional_digits)) {
1136
+ raise_parse_error_at("invalid number: %s", state, start);
1137
+ }
1138
+ }
1139
+
1140
+ // Parse exponent
1141
+ if (rb_tolower(peek(state)) == 'e') {
1142
+ integer = false;
1143
+ state->cursor++;
1144
+
1145
+ bool negative_exponent = false;
1146
+ const char next_char = peek(state);
1147
+ if (next_char == '-' || next_char == '+') {
1148
+ negative_exponent = next_char == '-';
1149
+ state->cursor++;
1150
+ }
1151
+
1152
+ uint64_t abs_exponent = 0;
1153
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1154
+
1155
+ if (RB_UNLIKELY(!exponent_digits)) {
1156
+ raise_parse_error_at("invalid number: %s", state, start);
1157
+ }
1158
+
1159
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1160
+ }
1161
+
1162
+ if (integer) {
1163
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1164
+ }
1165
+
1166
+ // Adjust exponent based on decimal point position
1167
+ if (decimal_point_pos >= 0) {
1168
+ exponent -= (mantissa_digits - decimal_point_pos);
1169
+ }
1170
+
1171
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1172
+ }
1173
+
1174
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1175
+ {
1176
+ return json_parse_number(state, config, false, state->cursor);
1177
+ }
1178
+
1179
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1180
+ {
1181
+ const char *start = state->cursor;
1182
+ state->cursor++;
1183
+ return json_parse_number(state, config, true, start);
1184
+ }
1185
+
1016
1186
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1017
1187
  {
1018
1188
  json_eat_whitespace(state);
1019
- if (state->cursor >= state->end) {
1020
- raise_parse_error("unexpected end of input", state);
1021
- }
1022
1189
 
1023
- switch (*state->cursor) {
1190
+ switch (peek(state)) {
1024
1191
  case 'n':
1025
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1192
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
1026
1193
  state->cursor += 4;
1027
1194
  return json_push_value(state, config, Qnil);
1028
1195
  }
@@ -1030,7 +1197,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1030
1197
  raise_parse_error("unexpected token %s", state);
1031
1198
  break;
1032
1199
  case 't':
1033
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1200
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
1034
1201
  state->cursor += 4;
1035
1202
  return json_push_value(state, config, Qtrue);
1036
1203
  }
@@ -1039,7 +1206,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1039
1206
  break;
1040
1207
  case 'f':
1041
1208
  // Note: memcmp with a small power of two compile to an integer comparison
1042
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1209
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1043
1210
  state->cursor += 5;
1044
1211
  return json_push_value(state, config, Qfalse);
1045
1212
  }
@@ -1048,7 +1215,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1048
1215
  break;
1049
1216
  case 'N':
1050
1217
  // Note: memcmp with a small power of two compile to an integer comparison
1051
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1218
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1052
1219
  state->cursor += 3;
1053
1220
  return json_push_value(state, config, CNaN);
1054
1221
  }
@@ -1056,16 +1223,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1056
1223
  raise_parse_error("unexpected token %s", state);
1057
1224
  break;
1058
1225
  case 'I':
1059
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1226
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1060
1227
  state->cursor += 8;
1061
1228
  return json_push_value(state, config, CInfinity);
1062
1229
  }
1063
1230
 
1064
1231
  raise_parse_error("unexpected token %s", state);
1065
1232
  break;
1066
- case '-':
1233
+ case '-': {
1067
1234
  // Note: memcmp with a small power of two compile to an integer comparison
1068
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1235
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1069
1236
  if (config->allow_nan) {
1070
1237
  state->cursor += 9;
1071
1238
  return json_push_value(state, config, CMinusInfinity);
@@ -1073,62 +1240,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1073
1240
  raise_parse_error("unexpected token %s", state);
1074
1241
  }
1075
1242
  }
1076
- // Fallthrough
1077
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
1078
- bool integer = true;
1079
-
1080
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
1081
- const char *start = state->cursor;
1082
- state->cursor++;
1083
-
1084
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1085
- state->cursor++;
1086
- }
1087
-
1088
- long integer_length = state->cursor - start;
1089
-
1090
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
1091
- raise_parse_error_at("invalid number: %s", state, start);
1092
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
1093
- raise_parse_error_at("invalid number: %s", state, start);
1094
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
1095
- raise_parse_error_at("invalid number: %s", state, start);
1096
- }
1097
-
1098
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
1099
- integer = false;
1100
- state->cursor++;
1101
-
1102
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1103
- raise_parse_error("invalid number: %s", state);
1104
- }
1105
-
1106
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1107
- state->cursor++;
1108
- }
1109
- }
1110
-
1111
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
1112
- integer = false;
1113
- state->cursor++;
1114
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
1115
- state->cursor++;
1116
- }
1117
-
1118
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1119
- raise_parse_error("invalid number: %s", state);
1120
- }
1121
-
1122
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1123
- state->cursor++;
1124
- }
1125
- }
1126
-
1127
- if (integer) {
1128
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1129
- }
1130
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1243
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1244
+ break;
1131
1245
  }
1246
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1247
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1248
+ break;
1132
1249
  case '"': {
1133
1250
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1134
1251
  return json_parse_string(state, config, false);
@@ -1139,7 +1256,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1139
1256
  json_eat_whitespace(state);
1140
1257
  long stack_head = state->stack->head;
1141
1258
 
1142
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1259
+ if (peek(state) == ']') {
1143
1260
  state->cursor++;
1144
1261
  return json_push_value(state, config, json_decode_array(state, config, 0));
1145
1262
  } else {
@@ -1154,26 +1271,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1154
1271
  while (true) {
1155
1272
  json_eat_whitespace(state);
1156
1273
 
1157
- if (state->cursor < state->end) {
1158
- if (*state->cursor == ']') {
1159
- state->cursor++;
1160
- long count = state->stack->head - stack_head;
1161
- state->current_nesting--;
1162
- state->in_array--;
1163
- return json_push_value(state, config, json_decode_array(state, config, count));
1164
- }
1274
+ const char next_char = peek(state);
1165
1275
 
1166
- if (*state->cursor == ',') {
1167
- state->cursor++;
1168
- if (config->allow_trailing_comma) {
1169
- json_eat_whitespace(state);
1170
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1171
- continue;
1172
- }
1276
+ if (RB_LIKELY(next_char == ',')) {
1277
+ state->cursor++;
1278
+ if (config->allow_trailing_comma) {
1279
+ json_eat_whitespace(state);
1280
+ if (peek(state) == ']') {
1281
+ continue;
1173
1282
  }
1174
- json_parse_any(state, config);
1175
- continue;
1176
1283
  }
1284
+ json_parse_any(state, config);
1285
+ continue;
1286
+ }
1287
+
1288
+ if (next_char == ']') {
1289
+ state->cursor++;
1290
+ long count = state->stack->head - stack_head;
1291
+ state->current_nesting--;
1292
+ state->in_array--;
1293
+ return json_push_value(state, config, json_decode_array(state, config, count));
1177
1294
  }
1178
1295
 
1179
1296
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1187,7 +1304,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1187
1304
  json_eat_whitespace(state);
1188
1305
  long stack_head = state->stack->head;
1189
1306
 
1190
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1307
+ if (peek(state) == '}') {
1191
1308
  state->cursor++;
1192
1309
  return json_push_value(state, config, json_decode_object(state, config, 0));
1193
1310
  } else {
@@ -1196,13 +1313,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1196
1313
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1197
1314
  }
1198
1315
 
1199
- if (*state->cursor != '"') {
1316
+ if (peek(state) != '"') {
1200
1317
  raise_parse_error("expected object key, got %s", state);
1201
1318
  }
1202
1319
  json_parse_string(state, config, true);
1203
1320
 
1204
1321
  json_eat_whitespace(state);
1205
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1322
+ if (peek(state) != ':') {
1206
1323
  raise_parse_error("expected ':' after object key", state);
1207
1324
  }
1208
1325
  state->cursor++;
@@ -1213,46 +1330,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1213
1330
  while (true) {
1214
1331
  json_eat_whitespace(state);
1215
1332
 
1216
- if (state->cursor < state->end) {
1217
- if (*state->cursor == '}') {
1218
- state->cursor++;
1219
- state->current_nesting--;
1220
- size_t count = state->stack->head - stack_head;
1333
+ const char next_char = peek(state);
1334
+ if (next_char == '}') {
1335
+ state->cursor++;
1336
+ state->current_nesting--;
1337
+ size_t count = state->stack->head - stack_head;
1221
1338
 
1222
- // Temporary rewind cursor in case an error is raised
1223
- const char *final_cursor = state->cursor;
1224
- state->cursor = object_start_cursor;
1225
- VALUE object = json_decode_object(state, config, count);
1226
- state->cursor = final_cursor;
1339
+ // Temporary rewind cursor in case an error is raised
1340
+ const char *final_cursor = state->cursor;
1341
+ state->cursor = object_start_cursor;
1342
+ VALUE object = json_decode_object(state, config, count);
1343
+ state->cursor = final_cursor;
1227
1344
 
1228
- return json_push_value(state, config, object);
1229
- }
1345
+ return json_push_value(state, config, object);
1346
+ }
1230
1347
 
1231
- if (*state->cursor == ',') {
1232
- state->cursor++;
1233
- json_eat_whitespace(state);
1348
+ if (next_char == ',') {
1349
+ state->cursor++;
1350
+ json_eat_whitespace(state);
1234
1351
 
1235
- if (config->allow_trailing_comma) {
1236
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1237
- continue;
1238
- }
1352
+ if (config->allow_trailing_comma) {
1353
+ if (peek(state) == '}') {
1354
+ continue;
1239
1355
  }
1356
+ }
1240
1357
 
1241
- if (*state->cursor != '"') {
1242
- raise_parse_error("expected object key, got: %s", state);
1243
- }
1244
- json_parse_string(state, config, true);
1358
+ if (RB_UNLIKELY(peek(state) != '"')) {
1359
+ raise_parse_error("expected object key, got: %s", state);
1360
+ }
1361
+ json_parse_string(state, config, true);
1245
1362
 
1246
- json_eat_whitespace(state);
1247
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1248
- raise_parse_error("expected ':' after object key, got: %s", state);
1249
- }
1250
- state->cursor++;
1363
+ json_eat_whitespace(state);
1364
+ if (RB_UNLIKELY(peek(state) != ':')) {
1365
+ raise_parse_error("expected ':' after object key, got: %s", state);
1366
+ }
1367
+ state->cursor++;
1251
1368
 
1252
- json_parse_any(state, config);
1369
+ json_parse_any(state, config);
1253
1370
 
1254
- continue;
1255
- }
1371
+ continue;
1256
1372
  }
1257
1373
 
1258
1374
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1260,18 +1376,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1260
1376
  break;
1261
1377
  }
1262
1378
 
1379
+ case 0:
1380
+ raise_parse_error("unexpected end of input", state);
1381
+ break;
1382
+
1263
1383
  default:
1264
1384
  raise_parse_error("unexpected character: %s", state);
1265
1385
  break;
1266
1386
  }
1267
1387
 
1268
- raise_parse_error("unreacheable: %s", state);
1388
+ raise_parse_error("unreachable: %s", state);
1389
+ return Qundef;
1269
1390
  }
1270
1391
 
1271
1392
  static void json_ensure_eof(JSON_ParserState *state)
1272
1393
  {
1273
1394
  json_eat_whitespace(state);
1274
- if (state->cursor != state->end) {
1395
+ if (!eos(state)) {
1275
1396
  raise_parse_error("unexpected token at end of stream %s", state);
1276
1397
  }
1277
1398
  }
@@ -1308,14 +1429,15 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
1308
1429
  {
1309
1430
  JSON_ParserConfig *config = (JSON_ParserConfig *)data;
1310
1431
 
1311
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1312
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1313
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1314
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1315
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
1316
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1317
- else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1318
- else if (key == sym_decimal_class) {
1432
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1433
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1434
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1435
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
1436
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1437
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
1438
+ else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1439
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1440
+ else if (key == sym_decimal_class) {
1319
1441
  if (RTEST(val)) {
1320
1442
  if (rb_respond_to(val, i_try_convert)) {
1321
1443
  config->decimal_class = val;
@@ -1388,6 +1510,7 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
1388
1510
  */
1389
1511
  static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
1390
1512
  {
1513
+ rb_check_frozen(self);
1391
1514
  GET_PARSER_CONFIG;
1392
1515
 
1393
1516
  parser_config_init(config, opts);
@@ -1483,7 +1606,7 @@ static const rb_data_type_t JSON_ParserConfig_type = {
1483
1606
  JSON_ParserConfig_memsize,
1484
1607
  },
1485
1608
  0, 0,
1486
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
1609
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
1487
1610
  };
1488
1611
 
1489
1612
  static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1527,16 +1650,13 @@ void Init_parser(void)
1527
1650
  sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
1528
1651
  sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
1529
1652
  sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
1653
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
1530
1654
  sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
1531
1655
  sym_freeze = ID2SYM(rb_intern("freeze"));
1532
1656
  sym_on_load = ID2SYM(rb_intern("on_load"));
1533
1657
  sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
1534
1658
  sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
1535
1659
 
1536
- i_chr = rb_intern("chr");
1537
- i_aset = rb_intern("[]=");
1538
- i_aref = rb_intern("[]");
1539
- i_leftshift = rb_intern("<<");
1540
1660
  i_new = rb_intern("new");
1541
1661
  i_try_convert = rb_intern("try_convert");
1542
1662
  i_uminus = rb_intern("-@");