json 2.15.1 → 2.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,50 +1,22 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
23
3
  #include "../simd/simd.h"
24
4
 
25
- #ifndef RB_UNLIKELY
26
- #define RB_UNLIKELY(expr) expr
27
- #endif
28
-
29
- #ifndef RB_LIKELY
30
- #define RB_LIKELY(expr) expr
31
- #endif
32
-
33
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
34
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
35
7
 
36
- static ID i_chr, i_aset, i_aref,
37
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
8
+ static ID i_new, i_try_convert, i_uminus, i_encode;
38
9
 
39
- static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
40
- sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
10
+ static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters,
11
+ sym_allow_invalid_escape, sym_symbolize_names, sym_freeze, sym_decimal_class, sym_on_load,
12
+ sym_allow_duplicate_key;
41
13
 
42
14
  static int binary_encindex;
43
15
  static int utf8_encindex;
44
16
 
45
17
  #ifndef HAVE_RB_HASH_BULK_INSERT
46
18
  // For TruffleRuby
47
- void
19
+ static void
48
20
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
49
21
  {
50
22
  long index = 0;
@@ -61,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
61
33
  #define rb_hash_new_capa(n) rb_hash_new()
62
34
  #endif
63
35
 
36
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
37
+ static VALUE rb_str_to_interned_str(VALUE str)
38
+ {
39
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
40
+ }
41
+ #endif
64
42
 
65
43
  /* name cache */
66
44
 
@@ -106,116 +84,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
106
84
  cache->entries[index] = rstring;
107
85
  }
108
86
 
109
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
87
+ #define rstring_cache_memcmp memcmp
88
+
89
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
90
+ #if __has_builtin(__builtin_bswap64)
91
+ #undef rstring_cache_memcmp
92
+ ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
110
93
  {
111
- long rstring_length = RSTRING_LEN(rstring);
112
- if (length == rstring_length) {
113
- return memcmp(str, RSTRING_PTR(rstring), length);
114
- } else {
115
- return (int)(length - rstring_length);
94
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
95
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
96
+ // inline a simpler memcmp outperforms calling the libc version.
97
+ long i = 0;
98
+
99
+ for (; i + 8 <= length; i += 8) {
100
+ uint64_t a, b;
101
+ memcpy(&a, str + i, 8);
102
+ memcpy(&b, rptr + i, 8);
103
+ if (a != b) {
104
+ a = __builtin_bswap64(a);
105
+ b = __builtin_bswap64(b);
106
+ return (a < b) ? -1 : 1;
107
+ }
108
+ }
109
+
110
+ for (; i < length; i++) {
111
+ if (str[i] != rptr[i]) {
112
+ return (str[i] < rptr[i]) ? -1 : 1;
113
+ }
116
114
  }
115
+
116
+ return 0;
117
117
  }
118
+ #endif
119
+ #endif
118
120
 
119
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
121
+ ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
120
122
  {
121
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
122
- // Common names aren't likely to be very long. So we just don't
123
- // cache names above an arbitrary threshold.
124
- return Qfalse;
125
- }
123
+ const char *rstring_ptr;
124
+ long rstring_length;
126
125
 
127
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
128
- // Simple heuristic, if the first character isn't a letter,
129
- // we're much less likely to see this string again.
130
- // We mostly want to cache strings that are likely to be repeated.
131
- return Qfalse;
126
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
127
+
128
+ if (length == rstring_length) {
129
+ return rstring_cache_memcmp(str, rstring_ptr, length);
130
+ } else {
131
+ return (int)(length - rstring_length);
132
132
  }
133
+ }
133
134
 
135
+ ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
136
+ {
134
137
  int low = 0;
135
138
  int high = cache->length - 1;
136
- int mid = 0;
137
- int last_cmp = 0;
138
139
 
139
140
  while (low <= high) {
140
- mid = (high + low) >> 1;
141
+ int mid = (high + low) >> 1;
141
142
  VALUE entry = cache->entries[mid];
142
- last_cmp = rstring_cache_cmp(str, length, entry);
143
+ int cmp = rstring_cache_cmp(str, length, entry);
143
144
 
144
- if (last_cmp == 0) {
145
+ if (cmp == 0) {
145
146
  return entry;
146
- } else if (last_cmp > 0) {
147
+ } else if (cmp > 0) {
147
148
  low = mid + 1;
148
149
  } else {
149
150
  high = mid - 1;
150
151
  }
151
152
  }
152
153
 
153
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
154
- // We assume the overwhelming majority of names don't need to be escaped.
155
- // But if they do, we have to fallback to the slow path.
156
- return Qfalse;
157
- }
158
-
159
154
  VALUE rstring = build_interned_string(str, length);
160
155
 
161
156
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
162
- if (last_cmp > 0) {
163
- mid += 1;
164
- }
165
-
166
- rvalue_cache_insert_at(cache, mid, rstring);
157
+ rvalue_cache_insert_at(cache, low, rstring);
167
158
  }
168
159
  return rstring;
169
160
  }
170
161
 
171
162
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
172
163
  {
173
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
174
- // Common names aren't likely to be very long. So we just don't
175
- // cache names above an arbitrary threshold.
176
- return Qfalse;
177
- }
178
-
179
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
180
- // Simple heuristic, if the first character isn't a letter,
181
- // we're much less likely to see this string again.
182
- // We mostly want to cache strings that are likely to be repeated.
183
- return Qfalse;
184
- }
185
-
186
164
  int low = 0;
187
165
  int high = cache->length - 1;
188
- int mid = 0;
189
- int last_cmp = 0;
190
166
 
191
167
  while (low <= high) {
192
- mid = (high + low) >> 1;
168
+ int mid = (high + low) >> 1;
193
169
  VALUE entry = cache->entries[mid];
194
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
170
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
195
171
 
196
- if (last_cmp == 0) {
172
+ if (cmp == 0) {
197
173
  return entry;
198
- } else if (last_cmp > 0) {
174
+ } else if (cmp > 0) {
199
175
  low = mid + 1;
200
176
  } else {
201
177
  high = mid - 1;
202
178
  }
203
179
  }
204
180
 
205
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
206
- // We assume the overwhelming majority of names don't need to be escaped.
207
- // But if they do, we have to fallback to the slow path.
208
- return Qfalse;
209
- }
210
-
211
181
  VALUE rsymbol = build_symbol(str, length);
212
182
 
213
183
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
214
- if (last_cmp > 0) {
215
- mid += 1;
216
- }
217
-
218
- rvalue_cache_insert_at(cache, mid, rsymbol);
184
+ rvalue_cache_insert_at(cache, low, rsymbol);
219
185
  }
220
186
  return rsymbol;
221
187
  }
@@ -330,15 +296,6 @@ static void rvalue_stack_eagerly_release(VALUE handle)
330
296
  }
331
297
  }
332
298
 
333
-
334
- #ifndef HAVE_STRNLEN
335
- static size_t strnlen(const char *s, size_t maxlen)
336
- {
337
- char *p;
338
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
339
- }
340
- #endif
341
-
342
299
  static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
343
300
  {
344
301
  int len = 1;
@@ -379,7 +336,8 @@ typedef struct JSON_ParserStruct {
379
336
  int max_nesting;
380
337
  bool allow_nan;
381
338
  bool allow_trailing_comma;
382
- bool parsing_name;
339
+ bool allow_control_characters;
340
+ bool allow_invalid_escape;
383
341
  bool symbolize_names;
384
342
  bool freeze;
385
343
  } JSON_ParserConfig;
@@ -395,6 +353,22 @@ typedef struct JSON_ParserStateStruct {
395
353
  int current_nesting;
396
354
  } JSON_ParserState;
397
355
 
356
+ static inline size_t rest(JSON_ParserState *state) {
357
+ return state->end - state->cursor;
358
+ }
359
+
360
+ static inline bool eos(JSON_ParserState *state) {
361
+ return state->cursor >= state->end;
362
+ }
363
+
364
+ static inline char peek(JSON_ParserState *state)
365
+ {
366
+ if (RB_UNLIKELY(eos(state))) {
367
+ return 0;
368
+ }
369
+ return *state->cursor;
370
+ }
371
+
398
372
  static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
399
373
  {
400
374
  const char *cursor = state->cursor;
@@ -428,10 +402,7 @@ static void emit_parse_warning(const char *message, JSON_ParserState *state)
428
402
 
429
403
  #define PARSE_ERROR_FRAGMENT_LEN 32
430
404
 
431
- #ifdef RBIMPL_ATTR_NORETURN
432
- RBIMPL_ATTR_NORETURN()
433
- #endif
434
- static void raise_parse_error(const char *format, JSON_ParserState *state)
405
+ NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state)
435
406
  {
436
407
  unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
437
408
  long line, column;
@@ -477,10 +448,7 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
477
448
  rb_exc_raise(exc);
478
449
  }
479
450
 
480
- #ifdef RBIMPL_ATTR_NORETURN
481
- RBIMPL_ATTR_NORETURN()
482
- #endif
483
- static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
451
+ NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
484
452
  {
485
453
  state->cursor = at;
486
454
  raise_parse_error(format, state);
@@ -505,23 +473,24 @@ static const signed char digit_values[256] = {
505
473
  -1, -1, -1, -1, -1, -1, -1
506
474
  };
507
475
 
508
- static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
509
- {
510
- signed char b;
511
- uint32_t result = 0;
512
- b = digit_values[p[0]];
513
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
514
- result = (result << 4) | (unsigned char)b;
515
- b = digit_values[p[1]];
516
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
517
- result = (result << 4) | (unsigned char)b;
518
- b = digit_values[p[2]];
519
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
520
- result = (result << 4) | (unsigned char)b;
521
- b = digit_values[p[3]];
522
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
523
- result = (result << 4) | (unsigned char)b;
524
- return result;
476
+ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
477
+ {
478
+ if (RB_UNLIKELY(sp > spe - 4)) {
479
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
480
+ }
481
+
482
+ const unsigned char *p = (const unsigned char *)sp;
483
+
484
+ const signed char b0 = digit_values[p[0]];
485
+ const signed char b1 = digit_values[p[1]];
486
+ const signed char b2 = digit_values[p[2]];
487
+ const signed char b3 = digit_values[p[3]];
488
+
489
+ if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) {
490
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
491
+ }
492
+
493
+ return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
525
494
  }
526
495
 
527
496
  #define GET_PARSER_CONFIG \
@@ -530,61 +499,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
530
499
 
531
500
  static const rb_data_type_t JSON_ParserConfig_type;
532
501
 
533
- static const bool whitespace[256] = {
534
- [' '] = 1,
535
- ['\t'] = 1,
536
- ['\n'] = 1,
537
- ['\r'] = 1,
538
- ['/'] = 1,
539
- };
540
-
541
502
  static void
542
503
  json_eat_comments(JSON_ParserState *state)
543
504
  {
544
- if (state->cursor + 1 < state->end) {
545
- switch (state->cursor[1]) {
546
- case '/': {
547
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
548
- if (!state->cursor) {
549
- state->cursor = state->end;
550
- } else {
551
- state->cursor++;
552
- }
553
- break;
505
+ const char *start = state->cursor;
506
+ state->cursor++;
507
+
508
+ switch (peek(state)) {
509
+ case '/': {
510
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
511
+ if (!state->cursor) {
512
+ state->cursor = state->end;
513
+ } else {
514
+ state->cursor++;
554
515
  }
555
- case '*': {
556
- state->cursor += 2;
557
- while (true) {
558
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
559
- if (!state->cursor) {
560
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
561
- } else {
562
- state->cursor++;
563
- if (state->cursor < state->end && *state->cursor == '/') {
564
- state->cursor++;
565
- break;
566
- }
567
- }
516
+ break;
517
+ }
518
+ case '*': {
519
+ state->cursor++;
520
+
521
+ while (true) {
522
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
523
+ if (!next_match) {
524
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
525
+ }
526
+
527
+ state->cursor = next_match + 1;
528
+ if (peek(state) == '/') {
529
+ state->cursor++;
530
+ break;
568
531
  }
569
- break;
570
532
  }
571
- default:
572
- raise_parse_error("unexpected token %s", state);
573
- break;
533
+ break;
574
534
  }
575
- } else {
576
- raise_parse_error("unexpected token %s", state);
535
+ default:
536
+ raise_parse_error_at("unexpected token %s", state, start);
537
+ break;
577
538
  }
578
539
  }
579
540
 
580
- static inline void
541
+ ALWAYS_INLINE(static) void
581
542
  json_eat_whitespace(JSON_ParserState *state)
582
543
  {
583
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
584
- if (RB_LIKELY(*state->cursor != '/')) {
585
- state->cursor++;
586
- } else {
587
- json_eat_comments(state);
544
+ while (true) {
545
+ switch (peek(state)) {
546
+ case ' ':
547
+ state->cursor++;
548
+ break;
549
+ case '\n':
550
+ state->cursor++;
551
+
552
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
553
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
554
+ while (rest(state) > 8) {
555
+ uint64_t chunk;
556
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
557
+ if (chunk == 0x2020202020202020) {
558
+ state->cursor += 8;
559
+ continue;
560
+ }
561
+
562
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
563
+ state->cursor += consecutive_spaces;
564
+ break;
565
+ }
566
+ #endif
567
+ break;
568
+ case '\t':
569
+ case '\r':
570
+ state->cursor++;
571
+ break;
572
+ case '/':
573
+ json_eat_comments(state);
574
+ break;
575
+
576
+ default:
577
+ return;
588
578
  }
589
579
  }
590
580
  }
@@ -615,11 +605,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
615
605
  return result;
616
606
  }
617
607
 
618
- static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
608
+ static inline bool json_string_cacheable_p(const char *string, size_t length)
619
609
  {
610
+ // We mostly want to cache strings that are likely to be repeated.
611
+ // Simple heuristics:
612
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
613
+ // - If the first character isn't a letter, we're much less likely to see this string again.
614
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
615
+ }
616
+
617
+ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
618
+ {
619
+ bool intern = is_name || config->freeze;
620
+ bool symbolize = is_name && config->symbolize_names;
620
621
  size_t bufferSize = stringEnd - string;
621
622
 
622
- if (is_name && state->in_array) {
623
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
623
624
  VALUE cached_key;
624
625
  if (RB_UNLIKELY(symbolize)) {
625
626
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -635,109 +636,127 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
635
636
  return build_string(string, stringEnd, intern, symbolize);
636
637
  }
637
638
 
638
- static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
639
- {
640
- size_t bufferSize = stringEnd - string;
641
- const char *p = string, *pe = string, *unescape, *bufferStart;
642
- char *buffer;
643
- int unescape_len;
644
- char buf[4];
639
+ #define JSON_MAX_UNESCAPE_POSITIONS 16
640
+ typedef struct _json_unescape_positions {
641
+ long size;
642
+ const char **positions;
643
+ unsigned long additional_backslashes;
644
+ } JSON_UnescapePositions;
645
645
 
646
- if (is_name && state->in_array) {
647
- VALUE cached_key;
648
- if (RB_UNLIKELY(symbolize)) {
649
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
650
- } else {
651
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
646
+ static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
647
+ {
648
+ while (positions->size) {
649
+ positions->size--;
650
+ const char *next_position = positions->positions[0];
651
+ positions->positions++;
652
+ if (next_position >= pe) {
653
+ return next_position;
652
654
  }
655
+ }
653
656
 
654
- if (RB_LIKELY(cached_key)) {
655
- return cached_key;
656
- }
657
+ if (positions->additional_backslashes) {
658
+ positions->additional_backslashes--;
659
+ return memchr(pe, '\\', stringEnd - pe);
657
660
  }
658
661
 
662
+ return NULL;
663
+ }
664
+
665
+ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
666
+ {
667
+ bool intern = is_name || config->freeze;
668
+ bool symbolize = is_name && config->symbolize_names;
669
+ size_t bufferSize = stringEnd - string;
670
+ const char *p = string, *pe = string, *bufferStart;
671
+ char *buffer;
672
+
659
673
  VALUE result = rb_str_buf_new(bufferSize);
660
674
  rb_enc_associate_index(result, utf8_encindex);
661
675
  buffer = RSTRING_PTR(result);
662
676
  bufferStart = buffer;
663
677
 
664
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
665
- unescape = (char *) "?";
666
- unescape_len = 1;
678
+ #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
679
+
680
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
667
681
  if (pe > p) {
668
682
  MEMCPY(buffer, p, char, pe - p);
669
683
  buffer += pe - p;
670
684
  }
671
685
  switch (*++pe) {
686
+ case '"':
687
+ case '/':
688
+ p = pe; // nothing to unescape just need to skip the backslash
689
+ break;
690
+ case '\\':
691
+ APPEND_CHAR('\\');
692
+ break;
672
693
  case 'n':
673
- unescape = (char *) "\n";
694
+ APPEND_CHAR('\n');
674
695
  break;
675
696
  case 'r':
676
- unescape = (char *) "\r";
697
+ APPEND_CHAR('\r');
677
698
  break;
678
699
  case 't':
679
- unescape = (char *) "\t";
680
- break;
681
- case '"':
682
- unescape = (char *) "\"";
683
- break;
684
- case '\\':
685
- unescape = (char *) "\\";
700
+ APPEND_CHAR('\t');
686
701
  break;
687
702
  case 'b':
688
- unescape = (char *) "\b";
703
+ APPEND_CHAR('\b');
689
704
  break;
690
705
  case 'f':
691
- unescape = (char *) "\f";
706
+ APPEND_CHAR('\f');
692
707
  break;
693
- case 'u':
694
- if (pe > stringEnd - 5) {
695
- raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
696
- } else {
697
- uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
698
- pe += 3;
699
- /* To handle values above U+FFFF, we take a sequence of
700
- * \uXXXX escapes in the U+D800..U+DBFF then
701
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
702
- * to make a 20-bit number, then add 0x10000 to get the
703
- * final codepoint.
704
- *
705
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
706
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
707
- * Area".
708
- */
709
- if ((ch & 0xFC00) == 0xD800) {
710
- pe++;
711
- if (pe > stringEnd - 6) {
712
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
713
- }
714
- if (pe[0] == '\\' && pe[1] == 'u') {
715
- uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
716
-
717
- if ((sur & 0xFC00) != 0xDC00) {
718
- raise_parse_error_at("invalid surrogate pair at %s", state, p);
719
- }
720
-
721
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
722
- | (sur & 0x3FF));
723
- pe += 5;
724
- } else {
725
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
726
- break;
708
+ case 'u': {
709
+ uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
710
+ pe += 3;
711
+ /* To handle values above U+FFFF, we take a sequence of
712
+ * \uXXXX escapes in the U+D800..U+DBFF then
713
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
714
+ * to make a 20-bit number, then add 0x10000 to get the
715
+ * final codepoint.
716
+ *
717
+ * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
718
+ * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
719
+ * Area".
720
+ */
721
+ if ((ch & 0xFC00) == 0xD800) {
722
+ pe++;
723
+ if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
724
+ uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);
725
+
726
+ if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
727
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
727
728
  }
729
+
730
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
731
+ pe += 5;
732
+ } else {
733
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
734
+ break;
728
735
  }
729
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
730
- unescape = buf;
731
736
  }
737
+
738
+ int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
739
+ buffer += unescape_len;
740
+ p = ++pe;
732
741
  break;
742
+ }
733
743
  default:
734
- p = pe;
735
- continue;
744
+ if ((unsigned char)*pe < 0x20) {
745
+ if (!config->allow_control_characters) {
746
+ if (*pe == '\n') {
747
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
748
+ }
749
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
750
+ }
751
+ } else if (config->allow_invalid_escape) {
752
+ APPEND_CHAR(*pe);
753
+ } else {
754
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
755
+ }
756
+ break;
736
757
  }
737
- MEMCPY(buffer, unescape, char, unescape_len);
738
- buffer += unescape_len;
739
- p = ++pe;
740
758
  }
759
+ #undef APPEND_CHAR
741
760
 
742
761
  if (stringEnd > p) {
743
762
  MEMCPY(buffer, p, char, stringEnd - p);
@@ -748,81 +767,85 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
748
767
  if (symbolize) {
749
768
  result = rb_str_intern(result);
750
769
  } else if (intern) {
751
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
770
+ result = rb_str_to_interned_str(result);
752
771
  }
753
772
 
754
773
  return result;
755
774
  }
756
775
 
757
776
  #define MAX_FAST_INTEGER_SIZE 18
758
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
759
- {
760
- bool negative = false;
761
- if (*p == '-') {
762
- negative = true;
763
- p++;
764
- }
777
+ #define MAX_NUMBER_STACK_BUFFER 128
765
778
 
766
- long long memo = 0;
767
- while (p < pe) {
768
- memo *= 10;
769
- memo += *p - '0';
770
- p++;
771
- }
779
+ typedef VALUE (*json_number_decode_func_t)(const char *ptr);
772
780
 
773
- if (negative) {
774
- memo = -memo;
781
+ static inline VALUE json_decode_large_number(const char *start, long len, json_number_decode_func_t func)
782
+ {
783
+ if (RB_LIKELY(len < MAX_NUMBER_STACK_BUFFER)) {
784
+ char buffer[MAX_NUMBER_STACK_BUFFER];
785
+ MEMCPY(buffer, start, char, len);
786
+ buffer[len] = '\0';
787
+ return func(buffer);
788
+ } else {
789
+ VALUE buffer_v = rb_str_tmp_new(len);
790
+ char *buffer = RSTRING_PTR(buffer_v);
791
+ MEMCPY(buffer, start, char, len);
792
+ buffer[len] = '\0';
793
+ VALUE number = func(buffer);
794
+ RB_GC_GUARD(buffer_v);
795
+ return number;
775
796
  }
776
- return LL2NUM(memo);
777
797
  }
778
798
 
779
- static VALUE json_decode_large_integer(const char *start, long len)
799
+ static VALUE json_decode_inum(const char *buffer)
780
800
  {
781
- VALUE buffer_v;
782
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
783
- MEMCPY(buffer, start, char, len);
784
- buffer[len] = '\0';
785
- VALUE number = rb_cstr2inum(buffer, 10);
786
- RB_ALLOCV_END(buffer_v);
787
- return number;
801
+ return rb_cstr2inum(buffer, 10);
788
802
  }
789
803
 
790
- static inline VALUE
791
- json_decode_integer(const char *start, const char *end)
804
+ NOINLINE(static) VALUE json_decode_large_integer(const char *start, long len)
792
805
  {
793
- long len = end - start;
794
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
795
- return fast_decode_integer(start, end);
806
+ return json_decode_large_number(start, len, json_decode_inum);
807
+ }
808
+
809
+ static inline VALUE json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
810
+ {
811
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
812
+ if (negative) {
813
+ return INT64T2NUM(-((int64_t)mantissa));
796
814
  }
797
- return json_decode_large_integer(start, len);
815
+ return UINT64T2NUM(mantissa);
816
+ }
817
+
818
+ return json_decode_large_integer(start, end - start);
798
819
  }
799
820
 
800
- static VALUE json_decode_large_float(const char *start, long len)
821
+ static VALUE json_decode_dnum(const char *buffer)
801
822
  {
802
- VALUE buffer_v;
803
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
804
- MEMCPY(buffer, start, char, len);
805
- buffer[len] = '\0';
806
- VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1));
807
- RB_ALLOCV_END(buffer_v);
808
- return number;
823
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
809
824
  }
810
825
 
811
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
826
+ NOINLINE(static) VALUE json_decode_large_float(const char *start, long len)
812
827
  {
813
- long len = end - start;
828
+ return json_decode_large_number(start, len, json_decode_dnum);
829
+ }
814
830
 
831
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
832
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
833
+ */
834
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
835
+ const char *start, const char *end)
836
+ {
815
837
  if (RB_UNLIKELY(config->decimal_class)) {
816
- VALUE text = rb_str_new(start, len);
838
+ VALUE text = rb_str_new(start, end - start);
817
839
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
818
- } else if (RB_LIKELY(len < 64)) {
819
- char buffer[64];
820
- MEMCPY(buffer, start, char, len);
821
- buffer[len] = '\0';
822
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
823
- } else {
824
- return json_decode_large_float(start, len);
825
840
  }
841
+
842
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
843
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
844
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
845
+ return json_decode_large_float(start, end - start);
846
+ }
847
+
848
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
826
849
  }
827
850
 
828
851
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -854,7 +877,7 @@ static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
854
877
  return Qfalse;
855
878
  }
856
879
 
857
- static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
880
+ NOINLINE(static) void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
858
881
  {
859
882
  VALUE message = rb_sprintf(
860
883
  "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
@@ -865,10 +888,7 @@ static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_
865
888
  RB_GC_GUARD(message);
866
889
  }
867
890
 
868
- #ifdef RBIMPL_ATTR_NORETURN
869
- RBIMPL_ATTR_NORETURN()
870
- #endif
871
- static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
891
+ NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
872
892
  {
873
893
  VALUE message = rb_sprintf(
874
894
  "duplicate key %"PRIsVALUE,
@@ -908,20 +928,6 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi
908
928
  return object;
909
929
  }
910
930
 
911
- static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
912
- {
913
- VALUE string;
914
- bool intern = is_name || config->freeze;
915
- bool symbolize = is_name && config->symbolize_names;
916
- if (escaped) {
917
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
918
- } else {
919
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
920
- }
921
-
922
- return string;
923
- }
924
-
925
931
  static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
926
932
  {
927
933
  if (RB_UNLIKELY(config->on_load_proc)) {
@@ -944,17 +950,11 @@ static const bool string_scan_table[256] = {
944
950
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
945
951
  };
946
952
 
947
- #if (defined(__GNUC__ ) || defined(__clang__))
948
- #define FORCE_INLINE __attribute__((always_inline))
949
- #else
950
- #define FORCE_INLINE
951
- #endif
952
-
953
953
  #ifdef HAVE_SIMD
954
954
  static SIMD_Implementation simd_impl = SIMD_NONE;
955
955
  #endif /* HAVE_SIMD */
956
956
 
957
- static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
957
+ ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
958
958
  {
959
959
  #ifdef HAVE_SIMD
960
960
  #if defined(HAVE_SIMD_NEON)
@@ -962,7 +962,7 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
962
962
  uint64_t mask = 0;
963
963
  if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
964
964
  state->cursor += trailing_zeros64(mask) >> 2;
965
- return 1;
965
+ return true;
966
966
  }
967
967
 
968
968
  #elif defined(HAVE_SIMD_SSE2)
@@ -970,64 +970,232 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
970
970
  int mask = 0;
971
971
  if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
972
972
  state->cursor += trailing_zeros(mask);
973
- return 1;
973
+ return true;
974
974
  }
975
975
  }
976
976
  #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
977
977
  #endif /* HAVE_SIMD */
978
978
 
979
- while (state->cursor < state->end) {
979
+ while (!eos(state)) {
980
980
  if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
981
- return 1;
981
+ return true;
982
982
  }
983
983
  state->cursor++;
984
984
  }
985
- return 0;
985
+ return false;
986
986
  }
987
987
 
988
- static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
988
+ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
989
989
  {
990
- state->cursor++;
991
- const char *start = state->cursor;
992
- bool escaped = false;
990
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
991
+ JSON_UnescapePositions positions = {
992
+ .size = 0,
993
+ .positions = backslashes,
994
+ .additional_backslashes = 0,
995
+ };
993
996
 
994
- while (RB_UNLIKELY(string_scan(state))) {
997
+ do {
995
998
  switch (*state->cursor) {
996
999
  case '"': {
997
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
1000
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
998
1001
  state->cursor++;
999
1002
  return json_push_value(state, config, string);
1000
1003
  }
1001
1004
  case '\\': {
1002
- state->cursor++;
1003
- escaped = true;
1004
- if ((unsigned char)*state->cursor < 0x20) {
1005
- raise_parse_error("invalid ASCII control character in string: %s", state);
1005
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
1006
+ backslashes[positions.size] = state->cursor;
1007
+ positions.size++;
1008
+ } else {
1009
+ positions.additional_backslashes++;
1006
1010
  }
1011
+ state->cursor++;
1007
1012
  break;
1008
1013
  }
1009
1014
  default:
1010
- raise_parse_error("invalid ASCII control character in string: %s", state);
1015
+ if (!config->allow_control_characters) {
1016
+ raise_parse_error("invalid ASCII control character in string: %s", state);
1017
+ }
1011
1018
  break;
1012
1019
  }
1013
1020
 
1014
1021
  state->cursor++;
1015
- }
1022
+ } while (string_scan(state));
1016
1023
 
1017
1024
  raise_parse_error("unexpected end of input, expected closing \"", state);
1018
1025
  return Qfalse;
1019
1026
  }
1020
1027
 
1028
+ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1029
+ {
1030
+ state->cursor++;
1031
+ const char *start = state->cursor;
1032
+
1033
+ if (RB_UNLIKELY(!string_scan(state))) {
1034
+ raise_parse_error("unexpected end of input, expected closing \"", state);
1035
+ }
1036
+
1037
+ if (RB_LIKELY(*state->cursor == '"')) {
1038
+ VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1039
+ state->cursor++;
1040
+ return json_push_value(state, config, string);
1041
+ }
1042
+ return json_parse_escaped_string(state, config, is_name, start);
1043
+ }
1044
+
1045
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1046
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1047
+ // Additional References:
1048
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1049
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1050
+ static inline uint64_t decode_8digits_unrolled(uint64_t val) {
1051
+ const uint64_t mask = 0x000000FF000000FF;
1052
+ const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
1053
+ const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
1054
+ val -= 0x3030303030303030;
1055
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1056
+ val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
1057
+ return val;
1058
+ }
1059
+
1060
+ static inline uint64_t decode_4digits_unrolled(uint32_t val) {
1061
+ const uint32_t mask = 0x000000FF;
1062
+ const uint32_t mul1 = 100;
1063
+ val -= 0x30303030;
1064
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1065
+ val = ((val & mask) * mul1) + (((val >> 16) & mask));
1066
+ return val;
1067
+ }
1068
+ #endif
1069
+
1070
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1071
+ {
1072
+ const char *start = state->cursor;
1073
+
1074
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1075
+ while (rest(state) >= sizeof(uint64_t)) {
1076
+ uint64_t next_8bytes;
1077
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1078
+
1079
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1080
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1081
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1082
+
1083
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1084
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1085
+ state->cursor += 8;
1086
+ continue;
1087
+ }
1088
+
1089
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1090
+
1091
+ if (consecutive_digits >= 4) {
1092
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1093
+ state->cursor += 4;
1094
+ consecutive_digits -= 4;
1095
+ }
1096
+
1097
+ while (consecutive_digits) {
1098
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1099
+ consecutive_digits--;
1100
+ state->cursor++;
1101
+ }
1102
+
1103
+ return (int)(state->cursor - start);
1104
+ }
1105
+ #endif
1106
+
1107
+ char next_char;
1108
+ while (rb_isdigit(next_char = peek(state))) {
1109
+ *accumulator = *accumulator * 10 + (next_char - '0');
1110
+ state->cursor++;
1111
+ }
1112
+ return (int)(state->cursor - start);
1113
+ }
1114
+
1115
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1116
+ {
1117
+ bool integer = true;
1118
+ const char first_digit = *state->cursor;
1119
+
1120
+ // Variables for Ryu optimization - extract digits during parsing
1121
+ int32_t exponent = 0;
1122
+ int decimal_point_pos = -1;
1123
+ uint64_t mantissa = 0;
1124
+
1125
+ // Parse integer part and extract mantissa digits
1126
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1127
+
1128
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1129
+ raise_parse_error_at("invalid number: %s", state, start);
1130
+ }
1131
+
1132
+ // Parse fractional part
1133
+ if (peek(state) == '.') {
1134
+ integer = false;
1135
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1136
+ state->cursor++;
1137
+
1138
+ int fractional_digits = json_parse_digits(state, &mantissa);
1139
+ mantissa_digits += fractional_digits;
1140
+
1141
+ if (RB_UNLIKELY(!fractional_digits)) {
1142
+ raise_parse_error_at("invalid number: %s", state, start);
1143
+ }
1144
+ }
1145
+
1146
+ // Parse exponent
1147
+ if (rb_tolower(peek(state)) == 'e') {
1148
+ integer = false;
1149
+ state->cursor++;
1150
+
1151
+ bool negative_exponent = false;
1152
+ const char next_char = peek(state);
1153
+ if (next_char == '-' || next_char == '+') {
1154
+ negative_exponent = next_char == '-';
1155
+ state->cursor++;
1156
+ }
1157
+
1158
+ uint64_t abs_exponent = 0;
1159
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1160
+
1161
+ if (RB_UNLIKELY(!exponent_digits)) {
1162
+ raise_parse_error_at("invalid number: %s", state, start);
1163
+ }
1164
+
1165
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1166
+ }
1167
+
1168
+ if (integer) {
1169
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1170
+ }
1171
+
1172
+ // Adjust exponent based on decimal point position
1173
+ if (decimal_point_pos >= 0) {
1174
+ exponent -= (mantissa_digits - decimal_point_pos);
1175
+ }
1176
+
1177
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1178
+ }
1179
+
1180
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1181
+ {
1182
+ return json_parse_number(state, config, false, state->cursor);
1183
+ }
1184
+
1185
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1186
+ {
1187
+ const char *start = state->cursor;
1188
+ state->cursor++;
1189
+ return json_parse_number(state, config, true, start);
1190
+ }
1191
+
1021
1192
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1022
1193
  {
1023
1194
  json_eat_whitespace(state);
1024
- if (state->cursor >= state->end) {
1025
- raise_parse_error("unexpected end of input", state);
1026
- }
1027
1195
 
1028
- switch (*state->cursor) {
1196
+ switch (peek(state)) {
1029
1197
  case 'n':
1030
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1198
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
1031
1199
  state->cursor += 4;
1032
1200
  return json_push_value(state, config, Qnil);
1033
1201
  }
@@ -1035,7 +1203,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1035
1203
  raise_parse_error("unexpected token %s", state);
1036
1204
  break;
1037
1205
  case 't':
1038
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1206
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
1039
1207
  state->cursor += 4;
1040
1208
  return json_push_value(state, config, Qtrue);
1041
1209
  }
@@ -1044,7 +1212,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1044
1212
  break;
1045
1213
  case 'f':
1046
1214
  // Note: memcmp with a small power of two compile to an integer comparison
1047
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1215
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1048
1216
  state->cursor += 5;
1049
1217
  return json_push_value(state, config, Qfalse);
1050
1218
  }
@@ -1053,7 +1221,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1053
1221
  break;
1054
1222
  case 'N':
1055
1223
  // Note: memcmp with a small power of two compile to an integer comparison
1056
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1224
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1057
1225
  state->cursor += 3;
1058
1226
  return json_push_value(state, config, CNaN);
1059
1227
  }
@@ -1061,16 +1229,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1061
1229
  raise_parse_error("unexpected token %s", state);
1062
1230
  break;
1063
1231
  case 'I':
1064
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1232
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1065
1233
  state->cursor += 8;
1066
1234
  return json_push_value(state, config, CInfinity);
1067
1235
  }
1068
1236
 
1069
1237
  raise_parse_error("unexpected token %s", state);
1070
1238
  break;
1071
- case '-':
1239
+ case '-': {
1072
1240
  // Note: memcmp with a small power of two compile to an integer comparison
1073
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1241
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1074
1242
  if (config->allow_nan) {
1075
1243
  state->cursor += 9;
1076
1244
  return json_push_value(state, config, CMinusInfinity);
@@ -1078,62 +1246,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1078
1246
  raise_parse_error("unexpected token %s", state);
1079
1247
  }
1080
1248
  }
1081
- // Fallthrough
1082
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
1083
- bool integer = true;
1084
-
1085
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
1086
- const char *start = state->cursor;
1087
- state->cursor++;
1088
-
1089
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1090
- state->cursor++;
1091
- }
1092
-
1093
- long integer_length = state->cursor - start;
1094
-
1095
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
1096
- raise_parse_error_at("invalid number: %s", state, start);
1097
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
1098
- raise_parse_error_at("invalid number: %s", state, start);
1099
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
1100
- raise_parse_error_at("invalid number: %s", state, start);
1101
- }
1102
-
1103
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
1104
- integer = false;
1105
- state->cursor++;
1106
-
1107
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1108
- raise_parse_error("invalid number: %s", state);
1109
- }
1110
-
1111
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1112
- state->cursor++;
1113
- }
1114
- }
1115
-
1116
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
1117
- integer = false;
1118
- state->cursor++;
1119
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
1120
- state->cursor++;
1121
- }
1122
-
1123
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1124
- raise_parse_error("invalid number: %s", state);
1125
- }
1126
-
1127
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1128
- state->cursor++;
1129
- }
1130
- }
1131
-
1132
- if (integer) {
1133
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1134
- }
1135
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1249
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1250
+ break;
1136
1251
  }
1252
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1253
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1254
+ break;
1137
1255
  case '"': {
1138
1256
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1139
1257
  return json_parse_string(state, config, false);
@@ -1144,7 +1262,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1144
1262
  json_eat_whitespace(state);
1145
1263
  long stack_head = state->stack->head;
1146
1264
 
1147
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1265
+ if (peek(state) == ']') {
1148
1266
  state->cursor++;
1149
1267
  return json_push_value(state, config, json_decode_array(state, config, 0));
1150
1268
  } else {
@@ -1159,26 +1277,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1159
1277
  while (true) {
1160
1278
  json_eat_whitespace(state);
1161
1279
 
1162
- if (state->cursor < state->end) {
1163
- if (*state->cursor == ']') {
1164
- state->cursor++;
1165
- long count = state->stack->head - stack_head;
1166
- state->current_nesting--;
1167
- state->in_array--;
1168
- return json_push_value(state, config, json_decode_array(state, config, count));
1169
- }
1280
+ const char next_char = peek(state);
1170
1281
 
1171
- if (*state->cursor == ',') {
1172
- state->cursor++;
1173
- if (config->allow_trailing_comma) {
1174
- json_eat_whitespace(state);
1175
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1176
- continue;
1177
- }
1282
+ if (RB_LIKELY(next_char == ',')) {
1283
+ state->cursor++;
1284
+ if (config->allow_trailing_comma) {
1285
+ json_eat_whitespace(state);
1286
+ if (peek(state) == ']') {
1287
+ continue;
1178
1288
  }
1179
- json_parse_any(state, config);
1180
- continue;
1181
1289
  }
1290
+ json_parse_any(state, config);
1291
+ continue;
1292
+ }
1293
+
1294
+ if (next_char == ']') {
1295
+ state->cursor++;
1296
+ long count = state->stack->head - stack_head;
1297
+ state->current_nesting--;
1298
+ state->in_array--;
1299
+ return json_push_value(state, config, json_decode_array(state, config, count));
1182
1300
  }
1183
1301
 
1184
1302
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1192,7 +1310,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1192
1310
  json_eat_whitespace(state);
1193
1311
  long stack_head = state->stack->head;
1194
1312
 
1195
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1313
+ if (peek(state) == '}') {
1196
1314
  state->cursor++;
1197
1315
  return json_push_value(state, config, json_decode_object(state, config, 0));
1198
1316
  } else {
@@ -1201,13 +1319,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1201
1319
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1202
1320
  }
1203
1321
 
1204
- if (*state->cursor != '"') {
1322
+ if (peek(state) != '"') {
1205
1323
  raise_parse_error("expected object key, got %s", state);
1206
1324
  }
1207
1325
  json_parse_string(state, config, true);
1208
1326
 
1209
1327
  json_eat_whitespace(state);
1210
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1328
+ if (peek(state) != ':') {
1211
1329
  raise_parse_error("expected ':' after object key", state);
1212
1330
  }
1213
1331
  state->cursor++;
@@ -1218,46 +1336,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1218
1336
  while (true) {
1219
1337
  json_eat_whitespace(state);
1220
1338
 
1221
- if (state->cursor < state->end) {
1222
- if (*state->cursor == '}') {
1223
- state->cursor++;
1224
- state->current_nesting--;
1225
- size_t count = state->stack->head - stack_head;
1339
+ const char next_char = peek(state);
1340
+ if (next_char == '}') {
1341
+ state->cursor++;
1342
+ state->current_nesting--;
1343
+ size_t count = state->stack->head - stack_head;
1226
1344
 
1227
- // Temporary rewind cursor in case an error is raised
1228
- const char *final_cursor = state->cursor;
1229
- state->cursor = object_start_cursor;
1230
- VALUE object = json_decode_object(state, config, count);
1231
- state->cursor = final_cursor;
1345
+ // Temporary rewind cursor in case an error is raised
1346
+ const char *final_cursor = state->cursor;
1347
+ state->cursor = object_start_cursor;
1348
+ VALUE object = json_decode_object(state, config, count);
1349
+ state->cursor = final_cursor;
1232
1350
 
1233
- return json_push_value(state, config, object);
1234
- }
1351
+ return json_push_value(state, config, object);
1352
+ }
1235
1353
 
1236
- if (*state->cursor == ',') {
1237
- state->cursor++;
1238
- json_eat_whitespace(state);
1354
+ if (next_char == ',') {
1355
+ state->cursor++;
1356
+ json_eat_whitespace(state);
1239
1357
 
1240
- if (config->allow_trailing_comma) {
1241
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1242
- continue;
1243
- }
1358
+ if (config->allow_trailing_comma) {
1359
+ if (peek(state) == '}') {
1360
+ continue;
1244
1361
  }
1362
+ }
1245
1363
 
1246
- if (*state->cursor != '"') {
1247
- raise_parse_error("expected object key, got: %s", state);
1248
- }
1249
- json_parse_string(state, config, true);
1364
+ if (RB_UNLIKELY(peek(state) != '"')) {
1365
+ raise_parse_error("expected object key, got: %s", state);
1366
+ }
1367
+ json_parse_string(state, config, true);
1250
1368
 
1251
- json_eat_whitespace(state);
1252
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1253
- raise_parse_error("expected ':' after object key, got: %s", state);
1254
- }
1255
- state->cursor++;
1369
+ json_eat_whitespace(state);
1370
+ if (RB_UNLIKELY(peek(state) != ':')) {
1371
+ raise_parse_error("expected ':' after object key, got: %s", state);
1372
+ }
1373
+ state->cursor++;
1256
1374
 
1257
- json_parse_any(state, config);
1375
+ json_parse_any(state, config);
1258
1376
 
1259
- continue;
1260
- }
1377
+ continue;
1261
1378
  }
1262
1379
 
1263
1380
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1265,18 +1382,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1265
1382
  break;
1266
1383
  }
1267
1384
 
1385
+ case 0:
1386
+ raise_parse_error("unexpected end of input", state);
1387
+ break;
1388
+
1268
1389
  default:
1269
1390
  raise_parse_error("unexpected character: %s", state);
1270
1391
  break;
1271
1392
  }
1272
1393
 
1273
1394
  raise_parse_error("unreachable: %s", state);
1395
+ return Qundef;
1274
1396
  }
1275
1397
 
1276
1398
  static void json_ensure_eof(JSON_ParserState *state)
1277
1399
  {
1278
1400
  json_eat_whitespace(state);
1279
- if (state->cursor != state->end) {
1401
+ if (!eos(state)) {
1280
1402
  raise_parse_error("unexpected token at end of stream %s", state);
1281
1403
  }
1282
1404
  }
@@ -1313,14 +1435,16 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
1313
1435
  {
1314
1436
  JSON_ParserConfig *config = (JSON_ParserConfig *)data;
1315
1437
 
1316
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1317
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1318
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1319
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1320
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
1321
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1322
- else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1323
- else if (key == sym_decimal_class) {
1438
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1439
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1440
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1441
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
1442
+ else if (key == sym_allow_invalid_escape) { config->allow_invalid_escape = RTEST(val); }
1443
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1444
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
1445
+ else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1446
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1447
+ else if (key == sym_decimal_class) {
1324
1448
  if (RTEST(val)) {
1325
1449
  if (rb_respond_to(val, i_try_convert)) {
1326
1450
  config->decimal_class = val;
@@ -1393,6 +1517,7 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
1393
1517
  */
1394
1518
  static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
1395
1519
  {
1520
+ rb_check_frozen(self);
1396
1521
  GET_PARSER_CONFIG;
1397
1522
 
1398
1523
  parser_config_init(config, opts);
@@ -1488,7 +1613,7 @@ static const rb_data_type_t JSON_ParserConfig_type = {
1488
1613
  JSON_ParserConfig_memsize,
1489
1614
  },
1490
1615
  0, 0,
1491
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
1616
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
1492
1617
  };
1493
1618
 
1494
1619
  static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1532,16 +1657,14 @@ void Init_parser(void)
1532
1657
  sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
1533
1658
  sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
1534
1659
  sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
1660
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
1661
+ sym_allow_invalid_escape = ID2SYM(rb_intern("allow_invalid_escape"));
1535
1662
  sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
1536
1663
  sym_freeze = ID2SYM(rb_intern("freeze"));
1537
1664
  sym_on_load = ID2SYM(rb_intern("on_load"));
1538
1665
  sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
1539
1666
  sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
1540
1667
 
1541
- i_chr = rb_intern("chr");
1542
- i_aset = rb_intern("[]=");
1543
- i_aref = rb_intern("[]");
1544
- i_leftshift = rb_intern("<<");
1545
1668
  i_new = rb_intern("new");
1546
1669
  i_try_convert = rb_intern("try_convert");
1547
1670
  i_uminus = rb_intern("-@");