json 2.12.2 → 2.18.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,48 +1,21 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
23
- #ifndef RB_UNLIKELY
24
- #define RB_UNLIKELY(expr) expr
25
- #endif
26
-
27
- #ifndef RB_LIKELY
28
- #define RB_LIKELY(expr) expr
29
- #endif
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
3
+ #include "../simd/simd.h"
30
4
 
31
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
32
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
33
7
 
34
- static ID i_chr, i_aset, i_aref,
35
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
8
+ static ID i_new, i_try_convert, i_uminus, i_encode;
36
9
 
37
- static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
38
- sym_decimal_class, sym_on_load;
10
+ static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, sym_symbolize_names, sym_freeze,
11
+ sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
39
12
 
40
13
  static int binary_encindex;
41
14
  static int utf8_encindex;
42
15
 
43
16
  #ifndef HAVE_RB_HASH_BULK_INSERT
44
17
  // For TruffleRuby
45
- void
18
+ static void
46
19
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
47
20
  {
48
21
  long index = 0;
@@ -59,6 +32,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
59
32
  #define rb_hash_new_capa(n) rb_hash_new()
60
33
  #endif
61
34
 
35
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
36
+ static VALUE rb_str_to_interned_str(VALUE str)
37
+ {
38
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
39
+ }
40
+ #endif
62
41
 
63
42
  /* name cache */
64
43
 
@@ -104,116 +83,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
104
83
  cache->entries[index] = rstring;
105
84
  }
106
85
 
107
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
86
+ #define rstring_cache_memcmp memcmp
87
+
88
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
89
+ #if __has_builtin(__builtin_bswap64)
90
+ #undef rstring_cache_memcmp
91
+ ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
108
92
  {
109
- long rstring_length = RSTRING_LEN(rstring);
110
- if (length == rstring_length) {
111
- return memcmp(str, RSTRING_PTR(rstring), length);
112
- } else {
113
- return (int)(length - rstring_length);
93
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
94
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
95
+ // inline a simpler memcmp outperforms calling the libc version.
96
+ long i = 0;
97
+
98
+ for (; i + 8 <= length; i += 8) {
99
+ uint64_t a, b;
100
+ memcpy(&a, str + i, 8);
101
+ memcpy(&b, rptr + i, 8);
102
+ if (a != b) {
103
+ a = __builtin_bswap64(a);
104
+ b = __builtin_bswap64(b);
105
+ return (a < b) ? -1 : 1;
106
+ }
107
+ }
108
+
109
+ for (; i < length; i++) {
110
+ if (str[i] != rptr[i]) {
111
+ return (str[i] < rptr[i]) ? -1 : 1;
112
+ }
114
113
  }
114
+
115
+ return 0;
115
116
  }
117
+ #endif
118
+ #endif
116
119
 
117
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
120
+ ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
118
121
  {
119
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
120
- // Common names aren't likely to be very long. So we just don't
121
- // cache names above an arbitrary threshold.
122
- return Qfalse;
123
- }
122
+ const char *rstring_ptr;
123
+ long rstring_length;
124
124
 
125
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
126
- // Simple heuristic, if the first character isn't a letter,
127
- // we're much less likely to see this string again.
128
- // We mostly want to cache strings that are likely to be repeated.
129
- return Qfalse;
125
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
126
+
127
+ if (length == rstring_length) {
128
+ return rstring_cache_memcmp(str, rstring_ptr, length);
129
+ } else {
130
+ return (int)(length - rstring_length);
130
131
  }
132
+ }
131
133
 
134
+ ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
135
+ {
132
136
  int low = 0;
133
137
  int high = cache->length - 1;
134
- int mid = 0;
135
- int last_cmp = 0;
136
138
 
137
139
  while (low <= high) {
138
- mid = (high + low) >> 1;
140
+ int mid = (high + low) >> 1;
139
141
  VALUE entry = cache->entries[mid];
140
- last_cmp = rstring_cache_cmp(str, length, entry);
142
+ int cmp = rstring_cache_cmp(str, length, entry);
141
143
 
142
- if (last_cmp == 0) {
144
+ if (cmp == 0) {
143
145
  return entry;
144
- } else if (last_cmp > 0) {
146
+ } else if (cmp > 0) {
145
147
  low = mid + 1;
146
148
  } else {
147
149
  high = mid - 1;
148
150
  }
149
151
  }
150
152
 
151
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
152
- // We assume the overwhelming majority of names don't need to be escaped.
153
- // But if they do, we have to fallback to the slow path.
154
- return Qfalse;
155
- }
156
-
157
153
  VALUE rstring = build_interned_string(str, length);
158
154
 
159
155
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
160
- if (last_cmp > 0) {
161
- mid += 1;
162
- }
163
-
164
- rvalue_cache_insert_at(cache, mid, rstring);
156
+ rvalue_cache_insert_at(cache, low, rstring);
165
157
  }
166
158
  return rstring;
167
159
  }
168
160
 
169
161
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
170
162
  {
171
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
172
- // Common names aren't likely to be very long. So we just don't
173
- // cache names above an arbitrary threshold.
174
- return Qfalse;
175
- }
176
-
177
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
178
- // Simple heuristic, if the first character isn't a letter,
179
- // we're much less likely to see this string again.
180
- // We mostly want to cache strings that are likely to be repeated.
181
- return Qfalse;
182
- }
183
-
184
163
  int low = 0;
185
164
  int high = cache->length - 1;
186
- int mid = 0;
187
- int last_cmp = 0;
188
165
 
189
166
  while (low <= high) {
190
- mid = (high + low) >> 1;
167
+ int mid = (high + low) >> 1;
191
168
  VALUE entry = cache->entries[mid];
192
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
169
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
193
170
 
194
- if (last_cmp == 0) {
171
+ if (cmp == 0) {
195
172
  return entry;
196
- } else if (last_cmp > 0) {
173
+ } else if (cmp > 0) {
197
174
  low = mid + 1;
198
175
  } else {
199
176
  high = mid - 1;
200
177
  }
201
178
  }
202
179
 
203
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
204
- // We assume the overwhelming majority of names don't need to be escaped.
205
- // But if they do, we have to fallback to the slow path.
206
- return Qfalse;
207
- }
208
-
209
180
  VALUE rsymbol = build_symbol(str, length);
210
181
 
211
182
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
212
- if (last_cmp > 0) {
213
- mid += 1;
214
- }
215
-
216
- rvalue_cache_insert_at(cache, mid, rsymbol);
183
+ rvalue_cache_insert_at(cache, low, rsymbol);
217
184
  }
218
185
  return rsymbol;
219
186
  }
@@ -328,15 +295,6 @@ static void rvalue_stack_eagerly_release(VALUE handle)
328
295
  }
329
296
  }
330
297
 
331
-
332
- #ifndef HAVE_STRNLEN
333
- static size_t strnlen(const char *s, size_t maxlen)
334
- {
335
- char *p;
336
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
337
- }
338
- #endif
339
-
340
298
  static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
341
299
  {
342
300
  int len = 1;
@@ -363,14 +321,21 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
363
321
  return len;
364
322
  }
365
323
 
324
+ enum duplicate_key_action {
325
+ JSON_DEPRECATED = 0,
326
+ JSON_IGNORE,
327
+ JSON_RAISE,
328
+ };
329
+
366
330
  typedef struct JSON_ParserStruct {
367
331
  VALUE on_load_proc;
368
332
  VALUE decimal_class;
369
333
  ID decimal_method_id;
334
+ enum duplicate_key_action on_duplicate_key;
370
335
  int max_nesting;
371
336
  bool allow_nan;
372
337
  bool allow_trailing_comma;
373
- bool parsing_name;
338
+ bool allow_control_characters;
374
339
  bool symbolize_names;
375
340
  bool freeze;
376
341
  } JSON_ParserConfig;
@@ -386,15 +351,24 @@ typedef struct JSON_ParserStateStruct {
386
351
  int current_nesting;
387
352
  } JSON_ParserState;
388
353
 
354
+ static inline size_t rest(JSON_ParserState *state) {
355
+ return state->end - state->cursor;
356
+ }
389
357
 
390
- #define PARSE_ERROR_FRAGMENT_LEN 32
391
- #ifdef RBIMPL_ATTR_NORETURN
392
- RBIMPL_ATTR_NORETURN()
393
- #endif
394
- static void raise_parse_error(const char *format, JSON_ParserState *state)
358
+ static inline bool eos(JSON_ParserState *state) {
359
+ return state->cursor >= state->end;
360
+ }
361
+
362
+ static inline char peek(JSON_ParserState *state)
395
363
  {
396
- unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
364
+ if (RB_UNLIKELY(eos(state))) {
365
+ return 0;
366
+ }
367
+ return *state->cursor;
368
+ }
397
369
 
370
+ static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
371
+ {
398
372
  const char *cursor = state->cursor;
399
373
  long column = 0;
400
374
  long line = 1;
@@ -411,6 +385,29 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
411
385
  line++;
412
386
  }
413
387
  }
388
+ *line_out = line;
389
+ *column_out = column;
390
+ }
391
+
392
+ static void emit_parse_warning(const char *message, JSON_ParserState *state)
393
+ {
394
+ long line, column;
395
+ cursor_position(state, &line, &column);
396
+
397
+ VALUE warning = rb_sprintf("%s at line %ld column %ld", message, line, column);
398
+ rb_funcall(mJSON, rb_intern("deprecation_warning"), 1, warning);
399
+ }
400
+
401
+ #define PARSE_ERROR_FRAGMENT_LEN 32
402
+
403
+ #ifdef RBIMPL_ATTR_NORETURN
404
+ RBIMPL_ATTR_NORETURN()
405
+ #endif
406
+ static void raise_parse_error(const char *format, JSON_ParserState *state)
407
+ {
408
+ unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
409
+ long line, column;
410
+ cursor_position(state, &line, &column);
414
411
 
415
412
  const char *ptr = "EOF";
416
413
  if (state->cursor && state->cursor < state->end) {
@@ -480,23 +477,24 @@ static const signed char digit_values[256] = {
480
477
  -1, -1, -1, -1, -1, -1, -1
481
478
  };
482
479
 
483
- static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
480
+ static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
484
481
  {
485
- signed char b;
486
- uint32_t result = 0;
487
- b = digit_values[p[0]];
488
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
489
- result = (result << 4) | (unsigned char)b;
490
- b = digit_values[p[1]];
491
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
492
- result = (result << 4) | (unsigned char)b;
493
- b = digit_values[p[2]];
494
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
495
- result = (result << 4) | (unsigned char)b;
496
- b = digit_values[p[3]];
497
- if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
498
- result = (result << 4) | (unsigned char)b;
499
- return result;
482
+ if (RB_UNLIKELY(sp > spe - 4)) {
483
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
484
+ }
485
+
486
+ const unsigned char *p = (const unsigned char *)sp;
487
+
488
+ const signed char b0 = digit_values[p[0]];
489
+ const signed char b1 = digit_values[p[1]];
490
+ const signed char b2 = digit_values[p[2]];
491
+ const signed char b3 = digit_values[p[3]];
492
+
493
+ if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) {
494
+ raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
495
+ }
496
+
497
+ return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
500
498
  }
501
499
 
502
500
  #define GET_PARSER_CONFIG \
@@ -505,61 +503,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
505
503
 
506
504
  static const rb_data_type_t JSON_ParserConfig_type;
507
505
 
508
- static const bool whitespace[256] = {
509
- [' '] = 1,
510
- ['\t'] = 1,
511
- ['\n'] = 1,
512
- ['\r'] = 1,
513
- ['/'] = 1,
514
- };
515
-
516
506
  static void
517
507
  json_eat_comments(JSON_ParserState *state)
518
508
  {
519
- if (state->cursor + 1 < state->end) {
520
- switch(state->cursor[1]) {
521
- case '/': {
522
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
523
- if (!state->cursor) {
524
- state->cursor = state->end;
525
- } else {
526
- state->cursor++;
527
- }
528
- break;
509
+ const char *start = state->cursor;
510
+ state->cursor++;
511
+
512
+ switch (peek(state)) {
513
+ case '/': {
514
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
515
+ if (!state->cursor) {
516
+ state->cursor = state->end;
517
+ } else {
518
+ state->cursor++;
529
519
  }
530
- case '*': {
531
- state->cursor += 2;
532
- while (true) {
533
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
534
- if (!state->cursor) {
535
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
536
- } else {
537
- state->cursor++;
538
- if (state->cursor < state->end && *state->cursor == '/') {
539
- state->cursor++;
540
- break;
541
- }
542
- }
520
+ break;
521
+ }
522
+ case '*': {
523
+ state->cursor++;
524
+
525
+ while (true) {
526
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
527
+ if (!next_match) {
528
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
529
+ }
530
+
531
+ state->cursor = next_match + 1;
532
+ if (peek(state) == '/') {
533
+ state->cursor++;
534
+ break;
543
535
  }
544
- break;
545
536
  }
546
- default:
547
- raise_parse_error("unexpected token %s", state);
548
- break;
537
+ break;
549
538
  }
550
- } else {
551
- raise_parse_error("unexpected token %s", state);
539
+ default:
540
+ raise_parse_error_at("unexpected token %s", state, start);
541
+ break;
552
542
  }
553
543
  }
554
544
 
555
- static inline void
545
+ ALWAYS_INLINE(static) void
556
546
  json_eat_whitespace(JSON_ParserState *state)
557
547
  {
558
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
559
- if (RB_LIKELY(*state->cursor != '/')) {
560
- state->cursor++;
561
- } else {
562
- json_eat_comments(state);
548
+ while (true) {
549
+ switch (peek(state)) {
550
+ case ' ':
551
+ state->cursor++;
552
+ break;
553
+ case '\n':
554
+ state->cursor++;
555
+
556
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
557
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
558
+ while (rest(state) > 8) {
559
+ uint64_t chunk;
560
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
561
+ if (chunk == 0x2020202020202020) {
562
+ state->cursor += 8;
563
+ continue;
564
+ }
565
+
566
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
567
+ state->cursor += consecutive_spaces;
568
+ break;
569
+ }
570
+ #endif
571
+ break;
572
+ case '\t':
573
+ case '\r':
574
+ state->cursor++;
575
+ break;
576
+ case '/':
577
+ json_eat_comments(state);
578
+ break;
579
+
580
+ default:
581
+ return;
563
582
  }
564
583
  }
565
584
  }
@@ -590,11 +609,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
590
609
  return result;
591
610
  }
592
611
 
593
- static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
612
+ static inline bool json_string_cacheable_p(const char *string, size_t length)
594
613
  {
614
+ // We mostly want to cache strings that are likely to be repeated.
615
+ // Simple heuristics:
616
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
617
+ // - If the first character isn't a letter, we're much less likely to see this string again.
618
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
619
+ }
620
+
621
+ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
622
+ {
623
+ bool intern = is_name || config->freeze;
624
+ bool symbolize = is_name && config->symbolize_names;
595
625
  size_t bufferSize = stringEnd - string;
596
626
 
597
- if (is_name && state->in_array) {
627
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
598
628
  VALUE cached_key;
599
629
  if (RB_UNLIKELY(symbolize)) {
600
630
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -610,104 +640,125 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
610
640
  return build_string(string, stringEnd, intern, symbolize);
611
641
  }
612
642
 
613
- static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
614
- {
615
- size_t bufferSize = stringEnd - string;
616
- const char *p = string, *pe = string, *unescape, *bufferStart;
617
- char *buffer;
618
- int unescape_len;
619
- char buf[4];
643
+ #define JSON_MAX_UNESCAPE_POSITIONS 16
644
+ typedef struct _json_unescape_positions {
645
+ long size;
646
+ const char **positions;
647
+ unsigned long additional_backslashes;
648
+ } JSON_UnescapePositions;
620
649
 
621
- if (is_name && state->in_array) {
622
- VALUE cached_key;
623
- if (RB_UNLIKELY(symbolize)) {
624
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
625
- } else {
626
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
650
+ static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
651
+ {
652
+ while (positions->size) {
653
+ positions->size--;
654
+ const char *next_position = positions->positions[0];
655
+ positions->positions++;
656
+ if (next_position >= pe) {
657
+ return next_position;
627
658
  }
659
+ }
628
660
 
629
- if (RB_LIKELY(cached_key)) {
630
- return cached_key;
631
- }
661
+ if (positions->additional_backslashes) {
662
+ positions->additional_backslashes--;
663
+ return memchr(pe, '\\', stringEnd - pe);
632
664
  }
633
665
 
666
+ return NULL;
667
+ }
668
+
669
+ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
670
+ {
671
+ bool intern = is_name || config->freeze;
672
+ bool symbolize = is_name && config->symbolize_names;
673
+ size_t bufferSize = stringEnd - string;
674
+ const char *p = string, *pe = string, *bufferStart;
675
+ char *buffer;
676
+
634
677
  VALUE result = rb_str_buf_new(bufferSize);
635
678
  rb_enc_associate_index(result, utf8_encindex);
636
679
  buffer = RSTRING_PTR(result);
637
680
  bufferStart = buffer;
638
681
 
639
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
640
- unescape = (char *) "?";
641
- unescape_len = 1;
682
+ #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
683
+
684
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
642
685
  if (pe > p) {
643
686
  MEMCPY(buffer, p, char, pe - p);
644
687
  buffer += pe - p;
645
688
  }
646
689
  switch (*++pe) {
690
+ case '"':
691
+ case '/':
692
+ p = pe; // nothing to unescape just need to skip the backslash
693
+ break;
694
+ case '\\':
695
+ APPEND_CHAR('\\');
696
+ break;
647
697
  case 'n':
648
- unescape = (char *) "\n";
698
+ APPEND_CHAR('\n');
649
699
  break;
650
700
  case 'r':
651
- unescape = (char *) "\r";
701
+ APPEND_CHAR('\r');
652
702
  break;
653
703
  case 't':
654
- unescape = (char *) "\t";
655
- break;
656
- case '"':
657
- unescape = (char *) "\"";
658
- break;
659
- case '\\':
660
- unescape = (char *) "\\";
704
+ APPEND_CHAR('\t');
661
705
  break;
662
706
  case 'b':
663
- unescape = (char *) "\b";
707
+ APPEND_CHAR('\b');
664
708
  break;
665
709
  case 'f':
666
- unescape = (char *) "\f";
710
+ APPEND_CHAR('\f');
667
711
  break;
668
- case 'u':
669
- if (pe > stringEnd - 5) {
670
- raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
671
- } else {
672
- uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
673
- pe += 3;
674
- /* To handle values above U+FFFF, we take a sequence of
675
- * \uXXXX escapes in the U+D800..U+DBFF then
676
- * U+DC00..U+DFFF ranges, take the low 10 bits from each
677
- * to make a 20-bit number, then add 0x10000 to get the
678
- * final codepoint.
679
- *
680
- * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
681
- * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
682
- * Area".
683
- */
684
- if ((ch & 0xFC00) == 0xD800) {
685
- pe++;
686
- if (pe > stringEnd - 6) {
687
- raise_parse_error_at("incomplete surrogate pair at %s", state, p);
688
- }
689
- if (pe[0] == '\\' && pe[1] == 'u') {
690
- uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
691
- ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
692
- | (sur & 0x3FF));
693
- pe += 5;
694
- } else {
695
- unescape = (char *) "?";
696
- break;
712
+ case 'u': {
713
+ uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
714
+ pe += 3;
715
+ /* To handle values above U+FFFF, we take a sequence of
716
+ * \uXXXX escapes in the U+D800..U+DBFF then
717
+ * U+DC00..U+DFFF ranges, take the low 10 bits from each
718
+ * to make a 20-bit number, then add 0x10000 to get the
719
+ * final codepoint.
720
+ *
721
+ * See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
722
+ * Surrogate Pairs in UTF-16", and 23.6 "Surrogates
723
+ * Area".
724
+ */
725
+ if ((ch & 0xFC00) == 0xD800) {
726
+ pe++;
727
+ if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
728
+ uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);
729
+
730
+ if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
731
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
697
732
  }
733
+
734
+ ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
735
+ pe += 5;
736
+ } else {
737
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
738
+ break;
698
739
  }
699
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
700
- unescape = buf;
701
740
  }
741
+
742
+ int unescape_len = convert_UTF32_to_UTF8(buffer, ch);
743
+ buffer += unescape_len;
744
+ p = ++pe;
702
745
  break;
746
+ }
703
747
  default:
704
- p = pe;
705
- continue;
748
+ if ((unsigned char)*pe < 0x20) {
749
+ if (!config->allow_control_characters) {
750
+ if (*pe == '\n') {
751
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
752
+ }
753
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
754
+ }
755
+ } else {
756
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
757
+ }
758
+ break;
706
759
  }
707
- MEMCPY(buffer, unescape, char, unescape_len);
708
- buffer += unescape_len;
709
- p = ++pe;
710
760
  }
761
+ #undef APPEND_CHAR
711
762
 
712
763
  if (stringEnd > p) {
713
764
  MEMCPY(buffer, p, char, stringEnd - p);
@@ -718,33 +769,13 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
718
769
  if (symbolize) {
719
770
  result = rb_str_intern(result);
720
771
  } else if (intern) {
721
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
772
+ result = rb_str_to_interned_str(result);
722
773
  }
723
774
 
724
775
  return result;
725
776
  }
726
777
 
727
778
  #define MAX_FAST_INTEGER_SIZE 18
728
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
729
- {
730
- bool negative = false;
731
- if (*p == '-') {
732
- negative = true;
733
- p++;
734
- }
735
-
736
- long long memo = 0;
737
- while (p < pe) {
738
- memo *= 10;
739
- memo += *p - '0';
740
- p++;
741
- }
742
-
743
- if (negative) {
744
- memo = -memo;
745
- }
746
- return LL2NUM(memo);
747
- }
748
779
 
749
780
  static VALUE json_decode_large_integer(const char *start, long len)
750
781
  {
@@ -758,17 +789,27 @@ static VALUE json_decode_large_integer(const char *start, long len)
758
789
  }
759
790
 
760
791
  static inline VALUE
761
- json_decode_integer(const char *start, const char *end)
792
+ json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
762
793
  {
763
- long len = end - start;
764
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
765
- return fast_decode_integer(start, end);
794
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
795
+ if (negative) {
796
+ return INT64T2NUM(-((int64_t)mantissa));
766
797
  }
767
- return json_decode_large_integer(start, len);
798
+ return UINT64T2NUM(mantissa);
799
+ }
800
+
801
+ return json_decode_large_integer(start, end - start);
768
802
  }
769
803
 
770
804
  static VALUE json_decode_large_float(const char *start, long len)
771
805
  {
806
+ if (RB_LIKELY(len < 64)) {
807
+ char buffer[64];
808
+ MEMCPY(buffer, start, char, len);
809
+ buffer[len] = '\0';
810
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
811
+ }
812
+
772
813
  VALUE buffer_v;
773
814
  char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
774
815
  MEMCPY(buffer, start, char, len);
@@ -778,21 +819,24 @@ static VALUE json_decode_large_float(const char *start, long len)
778
819
  return number;
779
820
  }
780
821
 
781
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
822
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
823
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
824
+ */
825
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
826
+ const char *start, const char *end)
782
827
  {
783
- long len = end - start;
784
-
785
828
  if (RB_UNLIKELY(config->decimal_class)) {
786
- VALUE text = rb_str_new(start, len);
829
+ VALUE text = rb_str_new(start, end - start);
787
830
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
788
- } else if (RB_LIKELY(len < 64)) {
789
- char buffer[64];
790
- MEMCPY(buffer, start, char, len);
791
- buffer[len] = '\0';
792
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
793
- } else {
794
- return json_decode_large_float(start, len);
795
831
  }
832
+
833
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
834
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
835
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
836
+ return json_decode_large_float(start, end - start);
837
+ }
838
+
839
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
796
840
  }
797
841
 
798
842
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -807,32 +851,75 @@ static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig
807
851
  return array;
808
852
  }
809
853
 
810
- static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, long count)
854
+ static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
811
855
  {
812
- VALUE object = rb_hash_new_capa(count);
813
- rb_hash_bulk_insert(count, rvalue_stack_peek(state->stack, count), object);
856
+ VALUE set = rb_hash_new_capa(count / 2);
857
+ for (size_t index = 0; index < count; index += 2) {
858
+ size_t before = RHASH_SIZE(set);
859
+ VALUE key = pairs[index];
860
+ rb_hash_aset(set, key, Qtrue);
861
+ if (RHASH_SIZE(set) == before) {
862
+ if (RB_SYMBOL_P(key)) {
863
+ return rb_sym2str(key);
864
+ }
865
+ return key;
866
+ }
867
+ }
868
+ return Qfalse;
869
+ }
814
870
 
815
- rvalue_stack_pop(state->stack, count);
871
+ static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
872
+ {
873
+ VALUE message = rb_sprintf(
874
+ "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
875
+ rb_inspect(duplicate_key)
876
+ );
816
877
 
817
- if (config->freeze) {
818
- RB_OBJ_FREEZE(object);
819
- }
878
+ emit_parse_warning(RSTRING_PTR(message), state);
879
+ RB_GC_GUARD(message);
880
+ }
820
881
 
821
- return object;
882
+ #ifdef RBIMPL_ATTR_NORETURN
883
+ RBIMPL_ATTR_NORETURN()
884
+ #endif
885
+ static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
886
+ {
887
+ VALUE message = rb_sprintf(
888
+ "duplicate key %"PRIsVALUE,
889
+ rb_inspect(duplicate_key)
890
+ );
891
+
892
+ raise_parse_error(RSTRING_PTR(message), state);
893
+ RB_GC_GUARD(message);
822
894
  }
823
895
 
824
- static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
896
+ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count)
825
897
  {
826
- VALUE string;
827
- bool intern = is_name || config->freeze;
828
- bool symbolize = is_name && config->symbolize_names;
829
- if (escaped) {
830
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
831
- } else {
832
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
898
+ size_t entries_count = count / 2;
899
+ VALUE object = rb_hash_new_capa(entries_count);
900
+ const VALUE *pairs = rvalue_stack_peek(state->stack, count);
901
+ rb_hash_bulk_insert(count, pairs, object);
902
+
903
+ if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) {
904
+ switch (config->on_duplicate_key) {
905
+ case JSON_IGNORE:
906
+ break;
907
+ case JSON_DEPRECATED:
908
+ emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
909
+ break;
910
+ case JSON_RAISE:
911
+ raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
912
+ break;
913
+ }
833
914
  }
834
915
 
835
- return string;
916
+ rvalue_stack_pop(state->stack, count);
917
+
918
+ if (config->freeze) {
919
+ RB_OBJ_FREEZE(object);
920
+ }
921
+
922
+ return object;
836
923
  }
837
924
 
838
925
  static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
@@ -844,7 +931,7 @@ static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *
844
931
  return value;
845
932
  }
846
933
 
847
- static const bool string_scan[256] = {
934
+ static const bool string_scan_table[256] = {
848
935
  // ASCII Control Characters
849
936
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
850
937
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -857,51 +944,252 @@ static const bool string_scan[256] = {
857
944
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
858
945
  };
859
946
 
860
- static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
947
+ #ifdef HAVE_SIMD
948
+ static SIMD_Implementation simd_impl = SIMD_NONE;
949
+ #endif /* HAVE_SIMD */
950
+
951
+ ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
861
952
  {
862
- state->cursor++;
863
- const char *start = state->cursor;
864
- bool escaped = false;
953
+ #ifdef HAVE_SIMD
954
+ #if defined(HAVE_SIMD_NEON)
865
955
 
866
- while (state->cursor < state->end) {
867
- if (RB_UNLIKELY(string_scan[(unsigned char)*state->cursor])) {
868
- switch (*state->cursor) {
869
- case '"': {
870
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
871
- state->cursor++;
872
- return json_push_value(state, config, string);
873
- }
874
- case '\\': {
875
- state->cursor++;
876
- escaped = true;
877
- if ((unsigned char)*state->cursor < 0x20) {
878
- raise_parse_error("invalid ASCII control character in string: %s", state);
879
- }
880
- break;
956
+ uint64_t mask = 0;
957
+ if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
958
+ state->cursor += trailing_zeros64(mask) >> 2;
959
+ return true;
960
+ }
961
+
962
+ #elif defined(HAVE_SIMD_SSE2)
963
+ if (simd_impl == SIMD_SSE2) {
964
+ int mask = 0;
965
+ if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
966
+ state->cursor += trailing_zeros(mask);
967
+ return true;
968
+ }
969
+ }
970
+ #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
971
+ #endif /* HAVE_SIMD */
972
+
973
+ while (!eos(state)) {
974
+ if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
975
+ return true;
976
+ }
977
+ state->cursor++;
978
+ }
979
+ return false;
980
+ }
981
+
982
+ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
983
+ {
984
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
985
+ JSON_UnescapePositions positions = {
986
+ .size = 0,
987
+ .positions = backslashes,
988
+ .additional_backslashes = 0,
989
+ };
990
+
991
+ do {
992
+ switch (*state->cursor) {
993
+ case '"': {
994
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
995
+ state->cursor++;
996
+ return json_push_value(state, config, string);
997
+ }
998
+ case '\\': {
999
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
1000
+ backslashes[positions.size] = state->cursor;
1001
+ positions.size++;
1002
+ } else {
1003
+ positions.additional_backslashes++;
881
1004
  }
882
- default:
883
- raise_parse_error("invalid ASCII control character in string: %s", state);
884
- break;
1005
+ state->cursor++;
1006
+ break;
885
1007
  }
1008
+ default:
1009
+ if (!config->allow_control_characters) {
1010
+ raise_parse_error("invalid ASCII control character in string: %s", state);
1011
+ }
1012
+ break;
886
1013
  }
887
1014
 
888
1015
  state->cursor++;
889
- }
1016
+ } while (string_scan(state));
890
1017
 
891
1018
  raise_parse_error("unexpected end of input, expected closing \"", state);
892
1019
  return Qfalse;
893
1020
  }
894
1021
 
1022
+ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1023
+ {
1024
+ state->cursor++;
1025
+ const char *start = state->cursor;
1026
+
1027
+ if (RB_UNLIKELY(!string_scan(state))) {
1028
+ raise_parse_error("unexpected end of input, expected closing \"", state);
1029
+ }
1030
+
1031
+ if (RB_LIKELY(*state->cursor == '"')) {
1032
+ VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1033
+ state->cursor++;
1034
+ return json_push_value(state, config, string);
1035
+ }
1036
+ return json_parse_escaped_string(state, config, is_name, start);
1037
+ }
1038
+
1039
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1040
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1041
+ // Additional References:
1042
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1043
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1044
+ static inline uint64_t decode_8digits_unrolled(uint64_t val) {
1045
+ const uint64_t mask = 0x000000FF000000FF;
1046
+ const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
1047
+ const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
1048
+ val -= 0x3030303030303030;
1049
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1050
+ val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
1051
+ return val;
1052
+ }
1053
+
1054
+ static inline uint64_t decode_4digits_unrolled(uint32_t val) {
1055
+ const uint32_t mask = 0x000000FF;
1056
+ const uint32_t mul1 = 100;
1057
+ val -= 0x30303030;
1058
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1059
+ val = ((val & mask) * mul1) + (((val >> 16) & mask));
1060
+ return val;
1061
+ }
1062
+ #endif
1063
+
1064
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1065
+ {
1066
+ const char *start = state->cursor;
1067
+
1068
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1069
+ while (rest(state) >= sizeof(uint64_t)) {
1070
+ uint64_t next_8bytes;
1071
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1072
+
1073
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1074
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1075
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1076
+
1077
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1078
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1079
+ state->cursor += 8;
1080
+ continue;
1081
+ }
1082
+
1083
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1084
+
1085
+ if (consecutive_digits >= 4) {
1086
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1087
+ state->cursor += 4;
1088
+ consecutive_digits -= 4;
1089
+ }
1090
+
1091
+ while (consecutive_digits) {
1092
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1093
+ consecutive_digits--;
1094
+ state->cursor++;
1095
+ }
1096
+
1097
+ return (int)(state->cursor - start);
1098
+ }
1099
+ #endif
1100
+
1101
+ char next_char;
1102
+ while (rb_isdigit(next_char = peek(state))) {
1103
+ *accumulator = *accumulator * 10 + (next_char - '0');
1104
+ state->cursor++;
1105
+ }
1106
+ return (int)(state->cursor - start);
1107
+ }
1108
+
1109
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1110
+ {
1111
+ bool integer = true;
1112
+ const char first_digit = *state->cursor;
1113
+
1114
+ // Variables for Ryu optimization - extract digits during parsing
1115
+ int32_t exponent = 0;
1116
+ int decimal_point_pos = -1;
1117
+ uint64_t mantissa = 0;
1118
+
1119
+ // Parse integer part and extract mantissa digits
1120
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1121
+
1122
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1123
+ raise_parse_error_at("invalid number: %s", state, start);
1124
+ }
1125
+
1126
+ // Parse fractional part
1127
+ if (peek(state) == '.') {
1128
+ integer = false;
1129
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1130
+ state->cursor++;
1131
+
1132
+ int fractional_digits = json_parse_digits(state, &mantissa);
1133
+ mantissa_digits += fractional_digits;
1134
+
1135
+ if (RB_UNLIKELY(!fractional_digits)) {
1136
+ raise_parse_error_at("invalid number: %s", state, start);
1137
+ }
1138
+ }
1139
+
1140
+ // Parse exponent
1141
+ if (rb_tolower(peek(state)) == 'e') {
1142
+ integer = false;
1143
+ state->cursor++;
1144
+
1145
+ bool negative_exponent = false;
1146
+ const char next_char = peek(state);
1147
+ if (next_char == '-' || next_char == '+') {
1148
+ negative_exponent = next_char == '-';
1149
+ state->cursor++;
1150
+ }
1151
+
1152
+ uint64_t abs_exponent = 0;
1153
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1154
+
1155
+ if (RB_UNLIKELY(!exponent_digits)) {
1156
+ raise_parse_error_at("invalid number: %s", state, start);
1157
+ }
1158
+
1159
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1160
+ }
1161
+
1162
+ if (integer) {
1163
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1164
+ }
1165
+
1166
+ // Adjust exponent based on decimal point position
1167
+ if (decimal_point_pos >= 0) {
1168
+ exponent -= (mantissa_digits - decimal_point_pos);
1169
+ }
1170
+
1171
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1172
+ }
1173
+
1174
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1175
+ {
1176
+ return json_parse_number(state, config, false, state->cursor);
1177
+ }
1178
+
1179
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1180
+ {
1181
+ const char *start = state->cursor;
1182
+ state->cursor++;
1183
+ return json_parse_number(state, config, true, start);
1184
+ }
1185
+
895
1186
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
896
1187
  {
897
1188
  json_eat_whitespace(state);
898
- if (state->cursor >= state->end) {
899
- raise_parse_error("unexpected end of input", state);
900
- }
901
1189
 
902
- switch (*state->cursor) {
1190
+ switch (peek(state)) {
903
1191
  case 'n':
904
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1192
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
905
1193
  state->cursor += 4;
906
1194
  return json_push_value(state, config, Qnil);
907
1195
  }
@@ -909,7 +1197,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
909
1197
  raise_parse_error("unexpected token %s", state);
910
1198
  break;
911
1199
  case 't':
912
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1200
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
913
1201
  state->cursor += 4;
914
1202
  return json_push_value(state, config, Qtrue);
915
1203
  }
@@ -918,7 +1206,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
918
1206
  break;
919
1207
  case 'f':
920
1208
  // Note: memcmp with a small power of two compile to an integer comparison
921
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1209
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
922
1210
  state->cursor += 5;
923
1211
  return json_push_value(state, config, Qfalse);
924
1212
  }
@@ -927,7 +1215,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
927
1215
  break;
928
1216
  case 'N':
929
1217
  // Note: memcmp with a small power of two compile to an integer comparison
930
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1218
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
931
1219
  state->cursor += 3;
932
1220
  return json_push_value(state, config, CNaN);
933
1221
  }
@@ -935,16 +1223,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
935
1223
  raise_parse_error("unexpected token %s", state);
936
1224
  break;
937
1225
  case 'I':
938
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1226
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
939
1227
  state->cursor += 8;
940
1228
  return json_push_value(state, config, CInfinity);
941
1229
  }
942
1230
 
943
1231
  raise_parse_error("unexpected token %s", state);
944
1232
  break;
945
- case '-':
1233
+ case '-': {
946
1234
  // Note: memcmp with a small power of two compile to an integer comparison
947
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1235
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
948
1236
  if (config->allow_nan) {
949
1237
  state->cursor += 9;
950
1238
  return json_push_value(state, config, CMinusInfinity);
@@ -952,62 +1240,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
952
1240
  raise_parse_error("unexpected token %s", state);
953
1241
  }
954
1242
  }
955
- // Fallthrough
956
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
957
- bool integer = true;
958
-
959
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
960
- const char *start = state->cursor;
961
- state->cursor++;
962
-
963
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
964
- state->cursor++;
965
- }
966
-
967
- long integer_length = state->cursor - start;
968
-
969
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
970
- raise_parse_error_at("invalid number: %s", state, start);
971
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
972
- raise_parse_error_at("invalid number: %s", state, start);
973
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
974
- raise_parse_error_at("invalid number: %s", state, start);
975
- }
976
-
977
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
978
- integer = false;
979
- state->cursor++;
980
-
981
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
982
- raise_parse_error("invalid number: %s", state);
983
- }
984
-
985
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
986
- state->cursor++;
987
- }
988
- }
989
-
990
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
991
- integer = false;
992
- state->cursor++;
993
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
994
- state->cursor++;
995
- }
996
-
997
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
998
- raise_parse_error("invalid number: %s", state);
999
- }
1000
-
1001
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1002
- state->cursor++;
1003
- }
1004
- }
1005
-
1006
- if (integer) {
1007
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1008
- }
1009
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1243
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1244
+ break;
1010
1245
  }
1246
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1247
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1248
+ break;
1011
1249
  case '"': {
1012
1250
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1013
1251
  return json_parse_string(state, config, false);
@@ -1018,7 +1256,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1018
1256
  json_eat_whitespace(state);
1019
1257
  long stack_head = state->stack->head;
1020
1258
 
1021
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1259
+ if (peek(state) == ']') {
1022
1260
  state->cursor++;
1023
1261
  return json_push_value(state, config, json_decode_array(state, config, 0));
1024
1262
  } else {
@@ -1033,26 +1271,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1033
1271
  while (true) {
1034
1272
  json_eat_whitespace(state);
1035
1273
 
1036
- if (state->cursor < state->end) {
1037
- if (*state->cursor == ']') {
1038
- state->cursor++;
1039
- long count = state->stack->head - stack_head;
1040
- state->current_nesting--;
1041
- state->in_array--;
1042
- return json_push_value(state, config, json_decode_array(state, config, count));
1043
- }
1274
+ const char next_char = peek(state);
1044
1275
 
1045
- if (*state->cursor == ',') {
1046
- state->cursor++;
1047
- if (config->allow_trailing_comma) {
1048
- json_eat_whitespace(state);
1049
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1050
- continue;
1051
- }
1276
+ if (RB_LIKELY(next_char == ',')) {
1277
+ state->cursor++;
1278
+ if (config->allow_trailing_comma) {
1279
+ json_eat_whitespace(state);
1280
+ if (peek(state) == ']') {
1281
+ continue;
1052
1282
  }
1053
- json_parse_any(state, config);
1054
- continue;
1055
1283
  }
1284
+ json_parse_any(state, config);
1285
+ continue;
1286
+ }
1287
+
1288
+ if (next_char == ']') {
1289
+ state->cursor++;
1290
+ long count = state->stack->head - stack_head;
1291
+ state->current_nesting--;
1292
+ state->in_array--;
1293
+ return json_push_value(state, config, json_decode_array(state, config, count));
1056
1294
  }
1057
1295
 
1058
1296
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1060,11 +1298,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1060
1298
  break;
1061
1299
  }
1062
1300
  case '{': {
1301
+ const char *object_start_cursor = state->cursor;
1302
+
1063
1303
  state->cursor++;
1064
1304
  json_eat_whitespace(state);
1065
1305
  long stack_head = state->stack->head;
1066
1306
 
1067
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1307
+ if (peek(state) == '}') {
1068
1308
  state->cursor++;
1069
1309
  return json_push_value(state, config, json_decode_object(state, config, 0));
1070
1310
  } else {
@@ -1073,13 +1313,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1073
1313
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1074
1314
  }
1075
1315
 
1076
- if (*state->cursor != '"') {
1316
+ if (peek(state) != '"') {
1077
1317
  raise_parse_error("expected object key, got %s", state);
1078
1318
  }
1079
1319
  json_parse_string(state, config, true);
1080
1320
 
1081
1321
  json_eat_whitespace(state);
1082
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1322
+ if (peek(state) != ':') {
1083
1323
  raise_parse_error("expected ':' after object key", state);
1084
1324
  }
1085
1325
  state->cursor++;
@@ -1090,39 +1330,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1090
1330
  while (true) {
1091
1331
  json_eat_whitespace(state);
1092
1332
 
1093
- if (state->cursor < state->end) {
1094
- if (*state->cursor == '}') {
1095
- state->cursor++;
1096
- state->current_nesting--;
1097
- long count = state->stack->head - stack_head;
1098
- return json_push_value(state, config, json_decode_object(state, config, count));
1099
- }
1333
+ const char next_char = peek(state);
1334
+ if (next_char == '}') {
1335
+ state->cursor++;
1336
+ state->current_nesting--;
1337
+ size_t count = state->stack->head - stack_head;
1100
1338
 
1101
- if (*state->cursor == ',') {
1102
- state->cursor++;
1103
- json_eat_whitespace(state);
1339
+ // Temporary rewind cursor in case an error is raised
1340
+ const char *final_cursor = state->cursor;
1341
+ state->cursor = object_start_cursor;
1342
+ VALUE object = json_decode_object(state, config, count);
1343
+ state->cursor = final_cursor;
1104
1344
 
1105
- if (config->allow_trailing_comma) {
1106
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1107
- continue;
1108
- }
1109
- }
1345
+ return json_push_value(state, config, object);
1346
+ }
1110
1347
 
1111
- if (*state->cursor != '"') {
1112
- raise_parse_error("expected object key, got: %s", state);
1113
- }
1114
- json_parse_string(state, config, true);
1348
+ if (next_char == ',') {
1349
+ state->cursor++;
1350
+ json_eat_whitespace(state);
1115
1351
 
1116
- json_eat_whitespace(state);
1117
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1118
- raise_parse_error("expected ':' after object key, got: %s", state);
1352
+ if (config->allow_trailing_comma) {
1353
+ if (peek(state) == '}') {
1354
+ continue;
1119
1355
  }
1120
- state->cursor++;
1356
+ }
1121
1357
 
1122
- json_parse_any(state, config);
1358
+ if (RB_UNLIKELY(peek(state) != '"')) {
1359
+ raise_parse_error("expected object key, got: %s", state);
1360
+ }
1361
+ json_parse_string(state, config, true);
1123
1362
 
1124
- continue;
1363
+ json_eat_whitespace(state);
1364
+ if (RB_UNLIKELY(peek(state) != ':')) {
1365
+ raise_parse_error("expected ':' after object key, got: %s", state);
1125
1366
  }
1367
+ state->cursor++;
1368
+
1369
+ json_parse_any(state, config);
1370
+
1371
+ continue;
1126
1372
  }
1127
1373
 
1128
1374
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1130,18 +1376,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1130
1376
  break;
1131
1377
  }
1132
1378
 
1379
+ case 0:
1380
+ raise_parse_error("unexpected end of input", state);
1381
+ break;
1382
+
1133
1383
  default:
1134
1384
  raise_parse_error("unexpected character: %s", state);
1135
1385
  break;
1136
1386
  }
1137
1387
 
1138
- raise_parse_error("unreacheable: %s", state);
1388
+ raise_parse_error("unreachable: %s", state);
1389
+ return Qundef;
1139
1390
  }
1140
1391
 
1141
1392
  static void json_ensure_eof(JSON_ParserState *state)
1142
1393
  {
1143
1394
  json_eat_whitespace(state);
1144
- if (state->cursor != state->end) {
1395
+ if (!eos(state)) {
1145
1396
  raise_parse_error("unexpected token at end of stream %s", state);
1146
1397
  }
1147
1398
  }
@@ -1178,13 +1429,15 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
1178
1429
  {
1179
1430
  JSON_ParserConfig *config = (JSON_ParserConfig *)data;
1180
1431
 
1181
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1182
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1183
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1184
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1185
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
1186
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1187
- else if (key == sym_decimal_class) {
1432
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1433
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1434
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1435
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
1436
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1437
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
1438
+ else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1439
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1440
+ else if (key == sym_decimal_class) {
1188
1441
  if (RTEST(val)) {
1189
1442
  if (rb_respond_to(val, i_try_convert)) {
1190
1443
  config->decimal_class = val;
@@ -1257,6 +1510,7 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
1257
1510
  */
1258
1511
  static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
1259
1512
  {
1513
+ rb_check_frozen(self);
1260
1514
  GET_PARSER_CONFIG;
1261
1515
 
1262
1516
  parser_config_init(config, opts);
@@ -1352,7 +1606,7 @@ static const rb_data_type_t JSON_ParserConfig_type = {
1352
1606
  JSON_ParserConfig_memsize,
1353
1607
  },
1354
1608
  0, 0,
1355
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
1609
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
1356
1610
  };
1357
1611
 
1358
1612
  static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1396,15 +1650,13 @@ void Init_parser(void)
1396
1650
  sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
1397
1651
  sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
1398
1652
  sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
1653
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
1399
1654
  sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
1400
1655
  sym_freeze = ID2SYM(rb_intern("freeze"));
1401
1656
  sym_on_load = ID2SYM(rb_intern("on_load"));
1402
1657
  sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
1658
+ sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
1403
1659
 
1404
- i_chr = rb_intern("chr");
1405
- i_aset = rb_intern("[]=");
1406
- i_aref = rb_intern("[]");
1407
- i_leftshift = rb_intern("<<");
1408
1660
  i_new = rb_intern("new");
1409
1661
  i_try_convert = rb_intern("try_convert");
1410
1662
  i_uminus = rb_intern("-@");
@@ -1413,4 +1665,8 @@ void Init_parser(void)
1413
1665
  binary_encindex = rb_ascii8bit_encindex();
1414
1666
  utf8_encindex = rb_utf8_encindex();
1415
1667
  enc_utf8 = rb_utf8_encoding();
1668
+
1669
+ #ifdef HAVE_SIMD
1670
+ simd_impl = find_simd_implementation();
1671
+ #endif
1416
1672
  }