json 2.13.1 → 2.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,50 +1,22 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
23
3
  #include "../simd/simd.h"
24
4
 
25
- #ifndef RB_UNLIKELY
26
- #define RB_UNLIKELY(expr) expr
27
- #endif
28
-
29
- #ifndef RB_LIKELY
30
- #define RB_LIKELY(expr) expr
31
- #endif
32
-
33
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
34
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
35
7
 
36
- static ID i_chr, i_aset, i_aref,
37
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
8
+ static ID i_new, i_try_convert, i_uminus, i_encode;
38
9
 
39
- static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
40
- sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
10
+ static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters,
11
+ sym_allow_invalid_escape, sym_symbolize_names, sym_freeze, sym_decimal_class, sym_on_load,
12
+ sym_allow_duplicate_key;
41
13
 
42
14
  static int binary_encindex;
43
15
  static int utf8_encindex;
44
16
 
45
17
  #ifndef HAVE_RB_HASH_BULK_INSERT
46
18
  // For TruffleRuby
47
- void
19
+ static void
48
20
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
49
21
  {
50
22
  long index = 0;
@@ -61,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
61
33
  #define rb_hash_new_capa(n) rb_hash_new()
62
34
  #endif
63
35
 
36
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
37
+ static VALUE rb_str_to_interned_str(VALUE str)
38
+ {
39
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
40
+ }
41
+ #endif
64
42
 
65
43
  /* name cache */
66
44
 
@@ -106,116 +84,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
106
84
  cache->entries[index] = rstring;
107
85
  }
108
86
 
109
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
87
+ #define rstring_cache_memcmp memcmp
88
+
89
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
90
+ #if __has_builtin(__builtin_bswap64)
91
+ #undef rstring_cache_memcmp
92
/*
 * Inlined replacement for memcmp() over the first `length` bytes of `str`
 * and `rptr`.
 *
 * Returns 0 when both ranges are equal, -1 when `str` sorts first and 1
 * otherwise. Bytes are compared as unsigned values in both loops, so the
 * sign of the result agrees with memcmp() for every input, including bytes
 * >= 0x80 (e.g. UTF-8 multi-byte sequences).
 */
ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
{
    // The libc memcmp has numerous complex optimizations, but in this particular case,
    // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
    // inline a simpler memcmp outperforms calling the libc version.
    long i = 0;

    // Compare 8 bytes at a time; memcpy() keeps the loads alignment-safe.
    for (; i + 8 <= length; i += 8) {
        uint64_t a, b;
        memcpy(&a, str + i, 8);
        memcpy(&b, rptr + i, 8);
        if (a != b) {
            // The words were loaded on a little-endian CPU: swapping the bytes
            // makes the first differing byte the most significant one, so an
            // unsigned integer comparison yields the lexicographic order.
            a = __builtin_bswap64(a);
            b = __builtin_bswap64(b);
            return (a < b) ? -1 : 1;
        }
    }

    // Remaining (< 8) bytes. Compare as unsigned char: plain `char` may be
    // signed, which would order bytes >= 0x80 before ASCII and disagree with
    // both the word loop above and memcmp().
    for (; i < length; i++) {
        const unsigned char lhs = (unsigned char)str[i];
        const unsigned char rhs = (unsigned char)rptr[i];
        if (lhs != rhs) {
            return (lhs < rhs) ? -1 : 1;
        }
    }

    return 0;
}
118
+ #endif
119
+ #endif
118
120
 
119
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
121
+ ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
120
122
  {
121
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
122
- // Common names aren't likely to be very long. So we just don't
123
- // cache names above an arbitrary threshold.
124
- return Qfalse;
125
- }
123
+ const char *rstring_ptr;
124
+ long rstring_length;
125
+
126
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
126
127
 
127
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
128
- // Simple heuristic, if the first character isn't a letter,
129
- // we're much less likely to see this string again.
130
- // We mostly want to cache strings that are likely to be repeated.
131
- return Qfalse;
128
+ if (length == rstring_length) {
129
+ return rstring_cache_memcmp(str, rstring_ptr, length);
130
+ } else {
131
+ return (int)(length - rstring_length);
132
132
  }
133
+ }
133
134
 
135
+ ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
136
+ {
134
137
  int low = 0;
135
138
  int high = cache->length - 1;
136
- int mid = 0;
137
- int last_cmp = 0;
138
139
 
139
140
  while (low <= high) {
140
- mid = (high + low) >> 1;
141
+ int mid = (high + low) >> 1;
141
142
  VALUE entry = cache->entries[mid];
142
- last_cmp = rstring_cache_cmp(str, length, entry);
143
+ int cmp = rstring_cache_cmp(str, length, entry);
143
144
 
144
- if (last_cmp == 0) {
145
+ if (cmp == 0) {
145
146
  return entry;
146
- } else if (last_cmp > 0) {
147
+ } else if (cmp > 0) {
147
148
  low = mid + 1;
148
149
  } else {
149
150
  high = mid - 1;
150
151
  }
151
152
  }
152
153
 
153
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
154
- // We assume the overwhelming majority of names don't need to be escaped.
155
- // But if they do, we have to fallback to the slow path.
156
- return Qfalse;
157
- }
158
-
159
154
  VALUE rstring = build_interned_string(str, length);
160
155
 
161
156
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
162
- if (last_cmp > 0) {
163
- mid += 1;
164
- }
165
-
166
- rvalue_cache_insert_at(cache, mid, rstring);
157
+ rvalue_cache_insert_at(cache, low, rstring);
167
158
  }
168
159
  return rstring;
169
160
  }
170
161
 
171
162
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
172
163
  {
173
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
174
- // Common names aren't likely to be very long. So we just don't
175
- // cache names above an arbitrary threshold.
176
- return Qfalse;
177
- }
178
-
179
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
180
- // Simple heuristic, if the first character isn't a letter,
181
- // we're much less likely to see this string again.
182
- // We mostly want to cache strings that are likely to be repeated.
183
- return Qfalse;
184
- }
185
-
186
164
  int low = 0;
187
165
  int high = cache->length - 1;
188
- int mid = 0;
189
- int last_cmp = 0;
190
166
 
191
167
  while (low <= high) {
192
- mid = (high + low) >> 1;
168
+ int mid = (high + low) >> 1;
193
169
  VALUE entry = cache->entries[mid];
194
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
170
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
195
171
 
196
- if (last_cmp == 0) {
172
+ if (cmp == 0) {
197
173
  return entry;
198
- } else if (last_cmp > 0) {
174
+ } else if (cmp > 0) {
199
175
  low = mid + 1;
200
176
  } else {
201
177
  high = mid - 1;
202
178
  }
203
179
  }
204
180
 
205
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
206
- // We assume the overwhelming majority of names don't need to be escaped.
207
- // But if they do, we have to fallback to the slow path.
208
- return Qfalse;
209
- }
210
-
211
181
  VALUE rsymbol = build_symbol(str, length);
212
182
 
213
183
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
214
- if (last_cmp > 0) {
215
- mid += 1;
216
- }
217
-
218
- rvalue_cache_insert_at(cache, mid, rsymbol);
184
+ rvalue_cache_insert_at(cache, low, rsymbol);
219
185
  }
220
186
  return rsymbol;
221
187
  }
@@ -330,15 +296,6 @@ static void rvalue_stack_eagerly_release(VALUE handle)
330
296
  }
331
297
  }
332
298
 
333
-
334
- #ifndef HAVE_STRNLEN
335
- static size_t strnlen(const char *s, size_t maxlen)
336
- {
337
- char *p;
338
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
339
- }
340
- #endif
341
-
342
299
  static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
343
300
  {
344
301
  int len = 1;
@@ -379,7 +336,8 @@ typedef struct JSON_ParserStruct {
379
336
  int max_nesting;
380
337
  bool allow_nan;
381
338
  bool allow_trailing_comma;
382
- bool parsing_name;
339
+ bool allow_control_characters;
340
+ bool allow_invalid_escape;
383
341
  bool symbolize_names;
384
342
  bool freeze;
385
343
  } JSON_ParserConfig;
@@ -395,6 +353,22 @@ typedef struct JSON_ParserStateStruct {
395
353
  int current_nesting;
396
354
  } JSON_ParserState;
397
355
 
356
+ static inline size_t rest(JSON_ParserState *state) {
357
+ return state->end - state->cursor;
358
+ }
359
+
360
+ static inline bool eos(JSON_ParserState *state) {
361
+ return state->cursor >= state->end;
362
+ }
363
+
364
+ static inline char peek(JSON_ParserState *state)
365
+ {
366
+ if (RB_UNLIKELY(eos(state))) {
367
+ return 0;
368
+ }
369
+ return *state->cursor;
370
+ }
371
+
398
372
  static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
399
373
  {
400
374
  const char *cursor = state->cursor;
@@ -422,18 +396,15 @@ static void emit_parse_warning(const char *message, JSON_ParserState *state)
422
396
  long line, column;
423
397
  cursor_position(state, &line, &column);
424
398
 
425
- rb_warn("%s at line %ld column %ld", message, line, column);
399
+ VALUE warning = rb_sprintf("%s at line %ld column %ld", message, line, column);
400
+ rb_funcall(mJSON, rb_intern("deprecation_warning"), 1, warning);
426
401
  }
427
402
 
428
403
  #define PARSE_ERROR_FRAGMENT_LEN 32
429
- #ifdef RBIMPL_ATTR_NORETURN
430
- RBIMPL_ATTR_NORETURN()
431
- #endif
432
- static void raise_parse_error(const char *format, JSON_ParserState *state)
404
+
405
+ static VALUE build_parse_error_message(const char *format, JSON_ParserState *state, long line, long column)
433
406
  {
434
407
  unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
435
- long line, column;
436
- cursor_position(state, &line, &column);
437
408
 
438
409
  const char *ptr = "EOF";
439
410
  if (state->cursor && state->cursor < state->end) {
@@ -468,17 +439,26 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
468
439
  VALUE msg = rb_sprintf(format, ptr);
469
440
  VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column);
470
441
  RB_GC_GUARD(msg);
442
+ return message;
443
+ }
471
444
 
445
+ static VALUE parse_error_new(VALUE message, long line, long column)
446
+ {
472
447
  VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message);
473
448
  rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line));
474
449
  rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column));
475
- rb_exc_raise(exc);
450
+ return exc;
476
451
  }
477
452
 
478
- #ifdef RBIMPL_ATTR_NORETURN
479
- RBIMPL_ATTR_NORETURN()
480
- #endif
481
- static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
453
+ NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state)
454
+ {
455
+ long line, column;
456
+ cursor_position(state, &line, &column);
457
+ VALUE message = build_parse_error_message(format, state, line, column);
458
+ rb_exc_raise(parse_error_new(message, line, column));
459
+ }
460
+
461
+ NORETURN(static) void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
482
462
  {
483
463
  state->cursor = at;
484
464
  raise_parse_error(format, state);
@@ -503,23 +483,24 @@ static const signed char digit_values[256] = {
503
483
  -1, -1, -1, -1, -1, -1, -1
504
484
  };
505
485
 
506
- static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
507
- {
508
- signed char b;
509
- uint32_t result = 0;
510
- b = digit_values[p[0]];
511
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
512
- result = (result << 4) | (unsigned char)b;
513
- b = digit_values[p[1]];
514
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
515
- result = (result << 4) | (unsigned char)b;
516
- b = digit_values[p[2]];
517
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
518
- result = (result << 4) | (unsigned char)b;
519
- b = digit_values[p[3]];
520
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
521
- result = (result << 4) | (unsigned char)b;
522
- return result;
486
+ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
487
+ {
488
+ if (RB_UNLIKELY(sp > spe - 4)) {
489
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
490
+ }
491
+
492
+ const unsigned char *p = (const unsigned char *)sp;
493
+
494
+ const signed char b0 = digit_values[p[0]];
495
+ const signed char b1 = digit_values[p[1]];
496
+ const signed char b2 = digit_values[p[2]];
497
+ const signed char b3 = digit_values[p[3]];
498
+
499
+ if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) {
500
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
501
+ }
502
+
503
+ return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
523
504
  }
524
505
 
525
506
  #define GET_PARSER_CONFIG \
@@ -528,61 +509,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
528
509
 
529
510
  static const rb_data_type_t JSON_ParserConfig_type;
530
511
 
531
- static const bool whitespace[256] = {
532
- [' '] = 1,
533
- ['\t'] = 1,
534
- ['\n'] = 1,
535
- ['\r'] = 1,
536
- ['/'] = 1,
537
- };
538
-
539
512
  static void
540
513
  json_eat_comments(JSON_ParserState *state)
541
514
  {
542
- if (state->cursor + 1 < state->end) {
543
- switch (state->cursor[1]) {
544
- case '/': {
545
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
546
- if (!state->cursor) {
547
- state->cursor = state->end;
548
- } else {
549
- state->cursor++;
550
- }
551
- break;
515
+ const char *start = state->cursor;
516
+ state->cursor++;
517
+
518
+ switch (peek(state)) {
519
+ case '/': {
520
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
521
+ if (!state->cursor) {
522
+ state->cursor = state->end;
523
+ } else {
524
+ state->cursor++;
552
525
  }
553
- case '*': {
554
- state->cursor += 2;
555
- while (true) {
556
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
557
- if (!state->cursor) {
558
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
559
- } else {
560
- state->cursor++;
561
- if (state->cursor < state->end && *state->cursor == '/') {
562
- state->cursor++;
563
- break;
564
- }
565
- }
526
+ break;
527
+ }
528
+ case '*': {
529
+ state->cursor++;
530
+
531
+ while (true) {
532
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
533
+ if (!next_match) {
534
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
535
+ }
536
+
537
+ state->cursor = next_match + 1;
538
+ if (peek(state) == '/') {
539
+ state->cursor++;
540
+ break;
566
541
  }
567
- break;
568
542
  }
569
- default:
570
- raise_parse_error("unexpected token %s", state);
571
- break;
543
+ break;
572
544
  }
573
- } else {
574
- raise_parse_error("unexpected token %s", state);
545
+ default:
546
+ raise_parse_error_at("unexpected token %s", state, start);
547
+ break;
575
548
  }
576
549
  }
577
550
 
578
- static inline void
551
+ ALWAYS_INLINE(static) void
579
552
  json_eat_whitespace(JSON_ParserState *state)
580
553
  {
581
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
582
- if (RB_LIKELY(*state->cursor != '/')) {
583
- state->cursor++;
584
- } else {
585
- json_eat_comments(state);
554
+ while (true) {
555
+ switch (peek(state)) {
556
+ case ' ':
557
+ state->cursor++;
558
+ break;
559
+ case '\n':
560
+ state->cursor++;
561
+
562
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
563
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
564
+ while (rest(state) > 8) {
565
+ uint64_t chunk;
566
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
567
+ if (chunk == 0x2020202020202020) {
568
+ state->cursor += 8;
569
+ continue;
570
+ }
571
+
572
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
573
+ state->cursor += consecutive_spaces;
574
+ break;
575
+ }
576
+ #endif
577
+ break;
578
+ case '\t':
579
+ case '\r':
580
+ state->cursor++;
581
+ break;
582
+ case '/':
583
+ json_eat_comments(state);
584
+ break;
585
+
586
+ default:
587
+ return;
586
588
  }
587
589
  }
588
590
  }
@@ -613,11 +615,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
613
615
  return result;
614
616
  }
615
617
 
616
- static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
618
+ static inline bool json_string_cacheable_p(const char *string, size_t length)
617
619
  {
620
+ // We mostly want to cache strings that are likely to be repeated.
621
+ // Simple heuristics:
622
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
623
+ // - If the first character isn't a letter, we're much less likely to see this string again.
624
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
625
+ }
626
+
627
+ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
628
+ {
629
+ bool intern = is_name || config->freeze;
630
+ bool symbolize = is_name && config->symbolize_names;
618
631
  size_t bufferSize = stringEnd - string;
619
632
 
620
- if (is_name && state->in_array) {
633
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
621
634
  VALUE cached_key;
622
635
  if (RB_UNLIKELY(symbolize)) {
623
636
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -633,104 +646,127 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
633
646
  return build_string(string, stringEnd, intern, symbolize);
634
647
  }
635
648
 
636
- static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
637
- {
638
- size_t bufferSize = stringEnd - string;
639
- const char *p = string, *pe = string, *unescape, *bufferStart;
640
- char *buffer;
641
- int unescape_len;
642
- char buf[4];
649
+ #define JSON_MAX_UNESCAPE_POSITIONS 16
650
+ typedef struct _json_unescape_positions {
651
+ long size;
652
+ const char **positions;
653
+ unsigned long additional_backslashes;
654
+ } JSON_UnescapePositions;
643
655
 
644
- if (is_name && state->in_array) {
645
- VALUE cached_key;
646
- if (RB_UNLIKELY(symbolize)) {
647
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
648
- } else {
649
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
656
+ static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
657
+ {
658
+ while (positions->size) {
659
+ positions->size--;
660
+ const char *next_position = positions->positions[0];
661
+ positions->positions++;
662
+ if (next_position >= pe) {
663
+ return next_position;
650
664
  }
665
+ }
651
666
 
652
- if (RB_LIKELY(cached_key)) {
653
- return cached_key;
654
- }
667
+ if (positions->additional_backslashes) {
668
+ positions->additional_backslashes--;
669
+ return memchr(pe, '\\', stringEnd - pe);
655
670
  }
656
671
 
672
+ return NULL;
673
+ }
674
+
675
+ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
676
+ {
677
+ bool intern = is_name || config->freeze;
678
+ bool symbolize = is_name && config->symbolize_names;
679
+ size_t bufferSize = stringEnd - string;
680
+ const char *p = string, *pe = string, *bufferStart;
681
+ char *buffer;
682
+
657
683
  VALUE result = rb_str_buf_new(bufferSize);
658
684
  rb_enc_associate_index(result, utf8_encindex);
659
685
  buffer = RSTRING_PTR(result);
660
686
  bufferStart = buffer;
661
687
 
662
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
663
- unescape = (char *) "?";
664
- unescape_len = 1;
688
+ #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
689
+
690
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
665
691
  if (pe > p) {
666
692
  MEMCPY(buffer, p, char, pe - p);
667
693
  buffer += pe - p;
668
694
  }
669
695
  switch (*++pe) {
696
+ case '"':
697
+ case '/':
698
+ p = pe; // nothing to unescape just need to skip the backslash
699
+ break;
700
+ case '\\':
701
+ APPEND_CHAR('\\');
702
+ break;
670
703
  case 'n':
671
- unescape = (char *) "\n";
704
+ APPEND_CHAR('\n');
672
705
  break;
673
706
  case 'r':
674
- unescape = (char *) "\r";
707
+ APPEND_CHAR('\r');
675
708
  break;
676
709
  case 't':
677
- unescape = (char *) "\t";
678
- break;
679
- case '"':
680
- unescape = (char *) "\"";
681
- break;
682
- case '\\':
683
- unescape = (char *) "\\";
710
+ APPEND_CHAR('\t');
684
711
  break;
685
712
  case 'b':
686
- unescape = (char *) "\b";
713
+ APPEND_CHAR('\b');
687
714
  break;
688
715
  case 'f':
689
- unescape = (char *) "\f";
716
+ APPEND_CHAR('\f');
690
717
  break;
691
- case 'u':
692
- if (pe > stringEnd - 5) {
693
- raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
694
- } else {
695
- uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
696
- pe += 3;
697
- /* To handle values above U+FFFF, we take a sequence of
698
- * \uXXXX escapes in the U+D800..U+DBFF then
699
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
700
- * to make a 20-bit number, then add 0x10000 to get the
701
- * final codepoint.
702
- *
703
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
704
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
705
- * Area".
706
- */
707
- if ((ch & 0xFC00) == 0xD800) {
708
- pe++;
709
- if (pe > stringEnd - 6) {
710
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
711
- }
712
- if (pe[0] == '\\' && pe[1] == 'u') {
713
- uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
714
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
715
- | (sur & 0x3FF));
716
- pe += 5;
717
- } else {
718
- unescape = (char *) "?";
719
- break;
718
+ case 'u': {
719
+ uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
720
+ pe += 3;
721
+ /* To handle values above U+FFFF, we take a sequence of
722
+ * \uXXXX escapes in the U+D800..U+DBFF then
723
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
724
+ * to make a 20-bit number, then add 0x10000 to get the
725
+ * final codepoint.
726
+ *
727
+ * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
728
+ * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
729
+ * Area".
730
+ */
731
+ if ((ch & 0xFC00) == 0xD800) {
732
+ pe++;
733
+ if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
734
+ uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);
735
+
736
+ if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
737
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
720
738
  }
739
+
740
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
741
+ pe += 5;
742
+ } else {
743
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
744
+ break;
721
745
  }
722
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
723
- unescape = buf;
724
746
  }
747
+
748
+ int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
749
+ buffer += unescape_len;
750
+ p = ++pe;
725
751
  break;
752
+ }
726
753
  default:
727
- p = pe;
728
- continue;
754
+ if ((unsigned char)*pe < 0x20) {
755
+ if (!config->allow_control_characters) {
756
+ if (*pe == '\n') {
757
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
758
+ }
759
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
760
+ }
761
+ } else if (config->allow_invalid_escape) {
762
+ APPEND_CHAR(*pe);
763
+ } else {
764
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
765
+ }
766
+ break;
729
767
  }
730
- MEMCPY(buffer, unescape, char, unescape_len);
731
- buffer += unescape_len;
732
- p = ++pe;
733
768
  }
769
+ #undef APPEND_CHAR
734
770
 
735
771
  if (stringEnd > p) {
736
772
  MEMCPY(buffer, p, char, stringEnd - p);
@@ -741,81 +777,85 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
741
777
  if (symbolize) {
742
778
  result = rb_str_intern(result);
743
779
  } else if (intern) {
744
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
780
+ result = rb_str_to_interned_str(result);
745
781
  }
746
782
 
747
783
  return result;
748
784
  }
749
785
 
750
786
  #define MAX_FAST_INTEGER_SIZE 18
751
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
752
- {
753
- bool negative = false;
754
- if (*p == '-') {
755
- negative = true;
756
- p++;
757
- }
787
+ #define MAX_NUMBER_STACK_BUFFER 128
758
788
 
759
- long long memo = 0;
760
- while (p < pe) {
761
- memo *= 10;
762
- memo += *p - '0';
763
- p++;
764
- }
789
+ typedef VALUE (*json_number_decode_func_t)(const char *ptr);
765
790
 
766
- if (negative) {
767
- memo = -memo;
791
+ static inline VALUE json_decode_large_number(const char *start, long len, json_number_decode_func_t func)
792
+ {
793
+ if (RB_LIKELY(len < MAX_NUMBER_STACK_BUFFER)) {
794
+ char buffer[MAX_NUMBER_STACK_BUFFER];
795
+ MEMCPY(buffer, start, char, len);
796
+ buffer[len] = '\0';
797
+ return func(buffer);
798
+ } else {
799
+ VALUE buffer_v = rb_str_tmp_new(len);
800
+ char *buffer = RSTRING_PTR(buffer_v);
801
+ MEMCPY(buffer, start, char, len);
802
+ buffer[len] = '\0';
803
+ VALUE number = func(buffer);
804
+ RB_GC_GUARD(buffer_v);
805
+ return number;
768
806
  }
769
- return LL2NUM(memo);
770
807
  }
771
808
 
772
- static VALUE json_decode_large_integer(const char *start, long len)
809
+ static VALUE json_decode_inum(const char *buffer)
773
810
  {
774
- VALUE buffer_v;
775
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
776
- MEMCPY(buffer, start, char, len);
777
- buffer[len] = '\0';
778
- VALUE number = rb_cstr2inum(buffer, 10);
779
- RB_ALLOCV_END(buffer_v);
780
- return number;
811
+ return rb_cstr2inum(buffer, 10);
781
812
  }
782
813
 
783
- static inline VALUE
784
- json_decode_integer(const char *start, const char *end)
814
+ NOINLINE(static) VALUE json_decode_large_integer(const char *start, long len)
785
815
  {
786
- long len = end - start;
787
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
788
- return fast_decode_integer(start, end);
816
+ return json_decode_large_number(start, len, json_decode_inum);
817
+ }
818
+
819
+ static inline VALUE json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
820
+ {
821
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
822
+ if (negative) {
823
+ return INT64T2NUM(-((int64_t)mantissa));
789
824
  }
790
- return json_decode_large_integer(start, len);
825
+ return UINT64T2NUM(mantissa);
826
+ }
827
+
828
+ return json_decode_large_integer(start, end - start);
791
829
  }
792
830
 
793
- static VALUE json_decode_large_float(const char *start, long len)
831
+ static VALUE json_decode_dnum(const char *buffer)
794
832
  {
795
- VALUE buffer_v;
796
- char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
797
- MEMCPY(buffer, start, char, len);
798
- buffer[len] = '\0';
799
- VALUE number = DBL2NUM(rb_cstr_to_dbl(buffer, 1));
800
- RB_ALLOCV_END(buffer_v);
801
- return number;
833
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
802
834
  }
803
835
 
804
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
836
+ NOINLINE(static) VALUE json_decode_large_float(const char *start, long len)
805
837
  {
806
- long len = end - start;
838
+ return json_decode_large_number(start, len, json_decode_dnum);
839
+ }
807
840
 
841
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
842
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
843
+ */
844
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
845
+ const char *start, const char *end)
846
+ {
808
847
  if (RB_UNLIKELY(config->decimal_class)) {
809
- VALUE text = rb_str_new(start, len);
848
+ VALUE text = rb_str_new(start, end - start);
810
849
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
811
- } else if (RB_LIKELY(len < 64)) {
812
- char buffer[64];
813
- MEMCPY(buffer, start, char, len);
814
- buffer[len] = '\0';
815
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
816
- } else {
817
- return json_decode_large_float(start, len);
818
850
  }
851
+
852
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
853
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
854
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
855
+ return json_decode_large_float(start, end - start);
856
+ }
857
+
858
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
819
859
  }
820
860
 
821
861
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -830,21 +870,66 @@ static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig
830
870
  return array;
831
871
  }
832
872
 
873
+ static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
874
+ {
875
+ VALUE set = rb_hash_new_capa(count / 2);
876
+ for (size_t index = 0; index < count; index += 2) {
877
+ size_t before = RHASH_SIZE(set);
878
+ VALUE key = pairs[index];
879
+ rb_hash_aset(set, key, Qtrue);
880
+ if (RHASH_SIZE(set) == before) {
881
+ if (RB_SYMBOL_P(key)) {
882
+ return rb_sym2str(key);
883
+ }
884
+ return key;
885
+ }
886
+ }
887
+ return Qfalse;
888
+ }
889
+
890
+ NOINLINE(static) void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
891
+ {
892
+ VALUE message = rb_sprintf(
893
+ "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
894
+ rb_inspect(duplicate_key)
895
+ );
896
+
897
+ emit_parse_warning(RSTRING_PTR(message), state);
898
+ RB_GC_GUARD(message);
899
+ }
900
+
901
+ NORETURN(static) void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
902
+ {
903
+ VALUE message = rb_sprintf(
904
+ "duplicate key %"PRIsVALUE,
905
+ rb_inspect(duplicate_key)
906
+ );
907
+
908
+ long line, column;
909
+ cursor_position(state, &line, &column);
910
+ rb_str_concat(message, build_parse_error_message("", state, line, column)) ;
911
+ rb_exc_raise(parse_error_new(message, line, column));
912
+
913
+ raise_parse_error(RSTRING_PTR(message), state);
914
+ RB_GC_GUARD(message);
915
+ }
916
+
833
917
  static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count)
834
918
  {
835
919
  size_t entries_count = count / 2;
836
920
  VALUE object = rb_hash_new_capa(entries_count);
837
- rb_hash_bulk_insert(count, rvalue_stack_peek(state->stack, count), object);
921
+ const VALUE *pairs = rvalue_stack_peek(state->stack, count);
922
+ rb_hash_bulk_insert(count, pairs, object);
838
923
 
839
924
  if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) {
840
925
  switch (config->on_duplicate_key) {
841
926
  case JSON_IGNORE:
842
927
  break;
843
928
  case JSON_DEPRECATED:
844
- emit_parse_warning("detected duplicate keys in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`", state);
929
+ emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
845
930
  break;
846
931
  case JSON_RAISE:
847
- raise_parse_error("duplicate key", state);
932
+ raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
848
933
  break;
849
934
  }
850
935
  }
@@ -858,20 +943,6 @@ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfi
858
943
  return object;
859
944
  }
860
945
 
861
- static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
862
- {
863
- VALUE string;
864
- bool intern = is_name || config->freeze;
865
- bool symbolize = is_name && config->symbolize_names;
866
- if (escaped) {
867
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
868
- } else {
869
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
870
- }
871
-
872
- return string;
873
- }
874
-
875
946
  static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
876
947
  {
877
948
  if (RB_UNLIKELY(config->on_load_proc)) {
@@ -894,17 +965,11 @@ static const bool string_scan_table[256] = {
894
965
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
895
966
  };
896
967
 
897
- #if (defined(__GNUC__ ) || defined(__clang__))
898
- #define FORCE_INLINE __attribute__((always_inline))
899
- #else
900
- #define FORCE_INLINE
901
- #endif
902
-
903
968
  #ifdef HAVE_SIMD
904
969
  static SIMD_Implementation simd_impl = SIMD_NONE;
905
970
  #endif /* HAVE_SIMD */
906
971
 
907
- static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
972
+ ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
908
973
  {
909
974
  #ifdef HAVE_SIMD
910
975
  #if defined(HAVE_SIMD_NEON)
@@ -912,7 +977,7 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
912
977
  uint64_t mask = 0;
913
978
  if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
914
979
  state->cursor += trailing_zeros64(mask) >> 2;
915
- return 1;
980
+ return true;
916
981
  }
917
982
 
918
983
  #elif defined(HAVE_SIMD_SSE2)
@@ -920,64 +985,232 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
920
985
  int mask = 0;
921
986
  if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
922
987
  state->cursor += trailing_zeros(mask);
923
- return 1;
988
+ return true;
924
989
  }
925
990
  }
926
991
  #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
927
992
  #endif /* HAVE_SIMD */
928
993
 
929
- while (state->cursor < state->end) {
994
+ while (!eos(state)) {
930
995
  if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
931
- return 1;
996
+ return true;
932
997
  }
933
- *state->cursor++;
998
+ state->cursor++;
934
999
  }
935
- return 0;
1000
+ return false;
936
1001
  }
937
1002
 
938
- static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1003
+ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
939
1004
  {
940
- state->cursor++;
941
- const char *start = state->cursor;
942
- bool escaped = false;
1005
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
1006
+ JSON_UnescapePositions positions = {
1007
+ .size = 0,
1008
+ .positions = backslashes,
1009
+ .additional_backslashes = 0,
1010
+ };
943
1011
 
944
- while (RB_UNLIKELY(string_scan(state))) {
1012
+ do {
945
1013
  switch (*state->cursor) {
946
1014
  case '"': {
947
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
1015
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
948
1016
  state->cursor++;
949
1017
  return json_push_value(state, config, string);
950
1018
  }
951
1019
  case '\\': {
952
- state->cursor++;
953
- escaped = true;
954
- if ((unsigned char)*state->cursor < 0x20) {
955
- raise_parse_error("invalid ASCII control character in string: %s", state);
1020
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
1021
+ backslashes[positions.size] = state->cursor;
1022
+ positions.size++;
1023
+ } else {
1024
+ positions.additional_backslashes++;
956
1025
  }
1026
+ state->cursor++;
957
1027
  break;
958
1028
  }
959
1029
  default:
960
- raise_parse_error("invalid ASCII control character in string: %s", state);
1030
+ if (!config->allow_control_characters) {
1031
+ raise_parse_error("invalid ASCII control character in string: %s", state);
1032
+ }
961
1033
  break;
962
1034
  }
963
1035
 
964
1036
  state->cursor++;
965
- }
1037
+ } while (string_scan(state));
966
1038
 
967
1039
  raise_parse_error("unexpected end of input, expected closing \"", state);
968
1040
  return Qfalse;
969
1041
  }
970
1042
 
1043
+ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1044
+ {
1045
+ state->cursor++;
1046
+ const char *start = state->cursor;
1047
+
1048
+ if (RB_UNLIKELY(!string_scan(state))) {
1049
+ raise_parse_error("unexpected end of input, expected closing \"", state);
1050
+ }
1051
+
1052
+ if (RB_LIKELY(*state->cursor == '"')) {
1053
+ VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1054
+ state->cursor++;
1055
+ return json_push_value(state, config, string);
1056
+ }
1057
+ return json_parse_escaped_string(state, config, is_name, start);
1058
+ }
1059
+
1060
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1061
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1062
+ // Additional References:
1063
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1064
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1065
// Convert eight ASCII digits packed into `val` to their numeric value.
// The first (most significant) digit is expected in the lowest byte of `val`.
// The caller must have verified that every byte is in '0'..'9'.
static inline uint64_t decode_8digits_unrolled(uint64_t val) {
    const uint64_t ascii_zeros  = 0x3030303030303030;
    const uint64_t pair_mask    = 0x000000FF000000FF;
    const uint64_t weights_low  = 0x000F424000000064; // 100 + (1000000ULL << 32)
    const uint64_t weights_high = 0x0000271000000001; // 1 + (10000ULL << 32)

    // Turn each ASCII byte into its digit value 0..9.
    uint64_t digits = val - ascii_zeros;

    // Every byte now holds 10 * digit + next digit; the even bytes are the
    // two-digit pairs. Same as: (digits * 2561) >> 8
    uint64_t pairs = (digits * 10) + (digits >> 8);

    // Scale the four pairs by 10^6, 10^4, 10^2 and 1; the sum lands in the
    // upper 32 bits.
    uint64_t pairs_0_and_2 = (pairs & pair_mask) * weights_low;
    uint64_t pairs_1_and_3 = ((pairs >> 16) & pair_mask) * weights_high;

    return (pairs_0_and_2 + pairs_1_and_3) >> 32;
}
1074
+
1075
// Convert four ASCII digits packed into `val` to their numeric value.
// The first (most significant) digit is expected in the lowest byte of `val`.
// The caller must have verified that every byte is in '0'..'9'.
static inline uint64_t decode_4digits_unrolled(uint32_t val) {
    const uint32_t ascii_zeros = 0x30303030;
    const uint32_t byte_mask   = 0x000000FF;
    const uint32_t hundreds    = 100;

    // Turn each ASCII byte into its digit value 0..9.
    uint32_t digits = val - ascii_zeros;

    // Every byte now holds 10 * digit + next digit; bytes 0 and 2 are the
    // two-digit pairs. Same as: (digits * 2561) >> 8
    uint32_t pairs = (digits * 10) + (digits >> 8);

    uint32_t leading_pair  = pairs & byte_mask;
    uint32_t trailing_pair = (pairs >> 16) & byte_mask;

    return (leading_pair * hundreds) + trailing_pair;
}
1083
+ #endif
1084
+
1085
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1086
+ {
1087
+ const char *start = state->cursor;
1088
+
1089
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1090
+ while (rest(state) >= sizeof(uint64_t)) {
1091
+ uint64_t next_8bytes;
1092
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1093
+
1094
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1095
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1096
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1097
+
1098
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1099
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1100
+ state->cursor += 8;
1101
+ continue;
1102
+ }
1103
+
1104
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1105
+
1106
+ if (consecutive_digits >= 4) {
1107
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1108
+ state->cursor += 4;
1109
+ consecutive_digits -= 4;
1110
+ }
1111
+
1112
+ while (consecutive_digits) {
1113
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1114
+ consecutive_digits--;
1115
+ state->cursor++;
1116
+ }
1117
+
1118
+ return (int)(state->cursor - start);
1119
+ }
1120
+ #endif
1121
+
1122
+ char next_char;
1123
+ while (rb_isdigit(next_char = peek(state))) {
1124
+ *accumulator = *accumulator * 10 + (next_char - '0');
1125
+ state->cursor++;
1126
+ }
1127
+ return (int)(state->cursor - start);
1128
+ }
1129
+
1130
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1131
+ {
1132
+ bool integer = true;
1133
+ const char first_digit = *state->cursor;
1134
+
1135
+ // Variables for Ryu optimization - extract digits during parsing
1136
+ int32_t exponent = 0;
1137
+ int decimal_point_pos = -1;
1138
+ uint64_t mantissa = 0;
1139
+
1140
+ // Parse integer part and extract mantissa digits
1141
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1142
+
1143
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1144
+ raise_parse_error_at("invalid number: %s", state, start);
1145
+ }
1146
+
1147
+ // Parse fractional part
1148
+ if (peek(state) == '.') {
1149
+ integer = false;
1150
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1151
+ state->cursor++;
1152
+
1153
+ int fractional_digits = json_parse_digits(state, &mantissa);
1154
+ mantissa_digits += fractional_digits;
1155
+
1156
+ if (RB_UNLIKELY(!fractional_digits)) {
1157
+ raise_parse_error_at("invalid number: %s", state, start);
1158
+ }
1159
+ }
1160
+
1161
+ // Parse exponent
1162
+ if (rb_tolower(peek(state)) == 'e') {
1163
+ integer = false;
1164
+ state->cursor++;
1165
+
1166
+ bool negative_exponent = false;
1167
+ const char next_char = peek(state);
1168
+ if (next_char == '-' || next_char == '+') {
1169
+ negative_exponent = next_char == '-';
1170
+ state->cursor++;
1171
+ }
1172
+
1173
+ uint64_t abs_exponent = 0;
1174
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1175
+
1176
+ if (RB_UNLIKELY(!exponent_digits)) {
1177
+ raise_parse_error_at("invalid number: %s", state, start);
1178
+ }
1179
+
1180
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1181
+ }
1182
+
1183
+ if (integer) {
1184
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1185
+ }
1186
+
1187
+ // Adjust exponent based on decimal point position
1188
+ if (decimal_point_pos >= 0) {
1189
+ exponent -= (mantissa_digits - decimal_point_pos);
1190
+ }
1191
+
1192
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1193
+ }
1194
+
1195
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1196
+ {
1197
+ return json_parse_number(state, config, false, state->cursor);
1198
+ }
1199
+
1200
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1201
+ {
1202
+ const char *start = state->cursor;
1203
+ state->cursor++;
1204
+ return json_parse_number(state, config, true, start);
1205
+ }
1206
+
971
1207
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
972
1208
  {
973
1209
  json_eat_whitespace(state);
974
- if (state->cursor >= state->end) {
975
- raise_parse_error("unexpected end of input", state);
976
- }
977
1210
 
978
- switch (*state->cursor) {
1211
+ switch (peek(state)) {
979
1212
  case 'n':
980
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1213
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
981
1214
  state->cursor += 4;
982
1215
  return json_push_value(state, config, Qnil);
983
1216
  }
@@ -985,7 +1218,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
985
1218
  raise_parse_error("unexpected token %s", state);
986
1219
  break;
987
1220
  case 't':
988
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1221
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
989
1222
  state->cursor += 4;
990
1223
  return json_push_value(state, config, Qtrue);
991
1224
  }
@@ -994,7 +1227,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
994
1227
  break;
995
1228
  case 'f':
996
1229
  // Note: memcmp with a small power of two compile to an integer comparison
997
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1230
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
998
1231
  state->cursor += 5;
999
1232
  return json_push_value(state, config, Qfalse);
1000
1233
  }
@@ -1003,7 +1236,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1003
1236
  break;
1004
1237
  case 'N':
1005
1238
  // Note: memcmp with a small power of two compile to an integer comparison
1006
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1239
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1007
1240
  state->cursor += 3;
1008
1241
  return json_push_value(state, config, CNaN);
1009
1242
  }
@@ -1011,16 +1244,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1011
1244
  raise_parse_error("unexpected token %s", state);
1012
1245
  break;
1013
1246
  case 'I':
1014
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1247
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1015
1248
  state->cursor += 8;
1016
1249
  return json_push_value(state, config, CInfinity);
1017
1250
  }
1018
1251
 
1019
1252
  raise_parse_error("unexpected token %s", state);
1020
1253
  break;
1021
- case '-':
1254
+ case '-': {
1022
1255
  // Note: memcmp with a small power of two compile to an integer comparison
1023
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1256
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1024
1257
  if (config->allow_nan) {
1025
1258
  state->cursor += 9;
1026
1259
  return json_push_value(state, config, CMinusInfinity);
@@ -1028,62 +1261,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1028
1261
  raise_parse_error("unexpected token %s", state);
1029
1262
  }
1030
1263
  }
1031
- // Fallthrough
1032
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
1033
- bool integer = true;
1034
-
1035
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
1036
- const char *start = state->cursor;
1037
- state->cursor++;
1038
-
1039
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1040
- state->cursor++;
1041
- }
1042
-
1043
- long integer_length = state->cursor - start;
1044
-
1045
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
1046
- raise_parse_error_at("invalid number: %s", state, start);
1047
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
1048
- raise_parse_error_at("invalid number: %s", state, start);
1049
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
1050
- raise_parse_error_at("invalid number: %s", state, start);
1051
- }
1052
-
1053
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
1054
- integer = false;
1055
- state->cursor++;
1056
-
1057
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1058
- raise_parse_error("invalid number: %s", state);
1059
- }
1060
-
1061
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1062
- state->cursor++;
1063
- }
1064
- }
1065
-
1066
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
1067
- integer = false;
1068
- state->cursor++;
1069
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
1070
- state->cursor++;
1071
- }
1072
-
1073
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1074
- raise_parse_error("invalid number: %s", state);
1075
- }
1076
-
1077
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1078
- state->cursor++;
1079
- }
1080
- }
1081
-
1082
- if (integer) {
1083
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1084
- }
1085
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1264
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1265
+ break;
1086
1266
  }
1267
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1268
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1269
+ break;
1087
1270
  case '"': {
1088
1271
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1089
1272
  return json_parse_string(state, config, false);
@@ -1094,7 +1277,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1094
1277
  json_eat_whitespace(state);
1095
1278
  long stack_head = state->stack->head;
1096
1279
 
1097
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1280
+ if (peek(state) == ']') {
1098
1281
  state->cursor++;
1099
1282
  return json_push_value(state, config, json_decode_array(state, config, 0));
1100
1283
  } else {
@@ -1109,26 +1292,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1109
1292
  while (true) {
1110
1293
  json_eat_whitespace(state);
1111
1294
 
1112
- if (state->cursor < state->end) {
1113
- if (*state->cursor == ']') {
1114
- state->cursor++;
1115
- long count = state->stack->head - stack_head;
1116
- state->current_nesting--;
1117
- state->in_array--;
1118
- return json_push_value(state, config, json_decode_array(state, config, count));
1119
- }
1295
+ const char next_char = peek(state);
1120
1296
 
1121
- if (*state->cursor == ',') {
1122
- state->cursor++;
1123
- if (config->allow_trailing_comma) {
1124
- json_eat_whitespace(state);
1125
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1126
- continue;
1127
- }
1297
+ if (RB_LIKELY(next_char == ',')) {
1298
+ state->cursor++;
1299
+ if (config->allow_trailing_comma) {
1300
+ json_eat_whitespace(state);
1301
+ if (peek(state) == ']') {
1302
+ continue;
1128
1303
  }
1129
- json_parse_any(state, config);
1130
- continue;
1131
1304
  }
1305
+ json_parse_any(state, config);
1306
+ continue;
1307
+ }
1308
+
1309
+ if (next_char == ']') {
1310
+ state->cursor++;
1311
+ long count = state->stack->head - stack_head;
1312
+ state->current_nesting--;
1313
+ state->in_array--;
1314
+ return json_push_value(state, config, json_decode_array(state, config, count));
1132
1315
  }
1133
1316
 
1134
1317
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1142,7 +1325,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1142
1325
  json_eat_whitespace(state);
1143
1326
  long stack_head = state->stack->head;
1144
1327
 
1145
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1328
+ if (peek(state) == '}') {
1146
1329
  state->cursor++;
1147
1330
  return json_push_value(state, config, json_decode_object(state, config, 0));
1148
1331
  } else {
@@ -1151,13 +1334,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1151
1334
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1152
1335
  }
1153
1336
 
1154
- if (*state->cursor != '"') {
1337
+ if (peek(state) != '"') {
1155
1338
  raise_parse_error("expected object key, got %s", state);
1156
1339
  }
1157
1340
  json_parse_string(state, config, true);
1158
1341
 
1159
1342
  json_eat_whitespace(state);
1160
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1343
+ if (peek(state) != ':') {
1161
1344
  raise_parse_error("expected ':' after object key", state);
1162
1345
  }
1163
1346
  state->cursor++;
@@ -1168,46 +1351,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1168
1351
  while (true) {
1169
1352
  json_eat_whitespace(state);
1170
1353
 
1171
- if (state->cursor < state->end) {
1172
- if (*state->cursor == '}') {
1173
- state->cursor++;
1174
- state->current_nesting--;
1175
- size_t count = state->stack->head - stack_head;
1354
+ const char next_char = peek(state);
1355
+ if (next_char == '}') {
1356
+ state->cursor++;
1357
+ state->current_nesting--;
1358
+ size_t count = state->stack->head - stack_head;
1176
1359
 
1177
- // Temporary rewind cursor in case an error is raised
1178
- const char *final_cursor = state->cursor;
1179
- state->cursor = object_start_cursor;
1180
- VALUE object = json_decode_object(state, config, count);
1181
- state->cursor = final_cursor;
1360
+ // Temporary rewind cursor in case an error is raised
1361
+ const char *final_cursor = state->cursor;
1362
+ state->cursor = object_start_cursor;
1363
+ VALUE object = json_decode_object(state, config, count);
1364
+ state->cursor = final_cursor;
1182
1365
 
1183
- return json_push_value(state, config, object);
1184
- }
1366
+ return json_push_value(state, config, object);
1367
+ }
1185
1368
 
1186
- if (*state->cursor == ',') {
1187
- state->cursor++;
1188
- json_eat_whitespace(state);
1369
+ if (next_char == ',') {
1370
+ state->cursor++;
1371
+ json_eat_whitespace(state);
1189
1372
 
1190
- if (config->allow_trailing_comma) {
1191
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1192
- continue;
1193
- }
1373
+ if (config->allow_trailing_comma) {
1374
+ if (peek(state) == '}') {
1375
+ continue;
1194
1376
  }
1377
+ }
1195
1378
 
1196
- if (*state->cursor != '"') {
1197
- raise_parse_error("expected object key, got: %s", state);
1198
- }
1199
- json_parse_string(state, config, true);
1379
+ if (RB_UNLIKELY(peek(state) != '"')) {
1380
+ raise_parse_error("expected object key, got: %s", state);
1381
+ }
1382
+ json_parse_string(state, config, true);
1200
1383
 
1201
- json_eat_whitespace(state);
1202
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1203
- raise_parse_error("expected ':' after object key, got: %s", state);
1204
- }
1205
- state->cursor++;
1384
+ json_eat_whitespace(state);
1385
+ if (RB_UNLIKELY(peek(state) != ':')) {
1386
+ raise_parse_error("expected ':' after object key, got: %s", state);
1387
+ }
1388
+ state->cursor++;
1206
1389
 
1207
- json_parse_any(state, config);
1390
+ json_parse_any(state, config);
1208
1391
 
1209
- continue;
1210
- }
1392
+ continue;
1211
1393
  }
1212
1394
 
1213
1395
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1215,18 +1397,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1215
1397
  break;
1216
1398
  }
1217
1399
 
1400
+ case 0:
1401
+ raise_parse_error("unexpected end of input", state);
1402
+ break;
1403
+
1218
1404
  default:
1219
1405
  raise_parse_error("unexpected character: %s", state);
1220
1406
  break;
1221
1407
  }
1222
1408
 
1223
- raise_parse_error("unreacheable: %s", state);
1409
+ raise_parse_error("unreachable: %s", state);
1410
+ return Qundef;
1224
1411
  }
1225
1412
 
1226
1413
  static void json_ensure_eof(JSON_ParserState *state)
1227
1414
  {
1228
1415
  json_eat_whitespace(state);
1229
- if (state->cursor != state->end) {
1416
+ if (!eos(state)) {
1230
1417
  raise_parse_error("unexpected token at end of stream %s", state);
1231
1418
  }
1232
1419
  }
@@ -1263,14 +1450,16 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
1263
1450
  {
1264
1451
  JSON_ParserConfig *config = (JSON_ParserConfig *)data;
1265
1452
 
1266
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1267
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1268
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1269
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1270
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
1271
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1272
- else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1273
- else if (key == sym_decimal_class) {
1453
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1454
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1455
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1456
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
1457
+ else if (key == sym_allow_invalid_escape) { config->allow_invalid_escape = RTEST(val); }
1458
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1459
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
1460
+ else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1461
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1462
+ else if (key == sym_decimal_class) {
1274
1463
  if (RTEST(val)) {
1275
1464
  if (rb_respond_to(val, i_try_convert)) {
1276
1465
  config->decimal_class = val;
@@ -1343,6 +1532,7 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
1343
1532
  */
1344
1533
  static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
1345
1534
  {
1535
+ rb_check_frozen(self);
1346
1536
  GET_PARSER_CONFIG;
1347
1537
 
1348
1538
  parser_config_init(config, opts);
@@ -1438,7 +1628,7 @@ static const rb_data_type_t JSON_ParserConfig_type = {
1438
1628
  JSON_ParserConfig_memsize,
1439
1629
  },
1440
1630
  0, 0,
1441
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
1631
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
1442
1632
  };
1443
1633
 
1444
1634
  static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1482,16 +1672,14 @@ void Init_parser(void)
1482
1672
  sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
1483
1673
  sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
1484
1674
  sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
1675
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
1676
+ sym_allow_invalid_escape = ID2SYM(rb_intern("allow_invalid_escape"));
1485
1677
  sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
1486
1678
  sym_freeze = ID2SYM(rb_intern("freeze"));
1487
1679
  sym_on_load = ID2SYM(rb_intern("on_load"));
1488
1680
  sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
1489
1681
  sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
1490
1682
 
1491
- i_chr = rb_intern("chr");
1492
- i_aset = rb_intern("[]=");
1493
- i_aref = rb_intern("[]");
1494
- i_leftshift = rb_intern("<<");
1495
1683
  i_new = rb_intern("new");
1496
1684
  i_try_convert = rb_intern("try_convert");
1497
1685
  i_uminus = rb_intern("-@");