json 2.12.2 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,48 +1,21 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
23
- #ifndef RB_UNLIKELY
24
- #define RB_UNLIKELY(expr) expr
25
- #endif
26
-
27
- #ifndef RB_LIKELY
28
- #define RB_LIKELY(expr) expr
29
- #endif
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
3
+ #include "../simd/simd.h"
30
4
 
31
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
32
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
33
7
 
34
- static ID i_chr, i_aset, i_aref,
35
- i_leftshift, i_new, i_try_convert, i_uminus, i_encode;
8
+ static ID i_new, i_try_convert, i_uminus, i_encode;
36
9
 
37
- static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze,
38
- sym_decimal_class, sym_on_load;
10
+ static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_allow_control_characters, sym_symbolize_names, sym_freeze,
11
+ sym_decimal_class, sym_on_load, sym_allow_duplicate_key;
39
12
 
40
13
  static int binary_encindex;
41
14
  static int utf8_encindex;
42
15
 
43
16
  #ifndef HAVE_RB_HASH_BULK_INSERT
44
17
  // For TruffleRuby
45
- void
18
+ static void
46
19
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
47
20
  {
48
21
  long index = 0;
@@ -59,6 +32,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
59
32
  #define rb_hash_new_capa(n) rb_hash_new()
60
33
  #endif
61
34
 
35
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
36
+ static VALUE rb_str_to_interned_str(VALUE str)
37
+ {
38
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
39
+ }
40
+ #endif
62
41
 
63
42
  /* name cache */
64
43
 
@@ -104,116 +83,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
104
83
  cache->entries[index] = rstring;
105
84
  }
106
85
 
107
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
86
+ #define rstring_cache_memcmp memcmp
87
+
88
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
89
+ #if __has_builtin(__builtin_bswap64)
90
+ #undef rstring_cache_memcmp
91
+ ALWAYS_INLINE(static) int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
108
92
  {
109
- long rstring_length = RSTRING_LEN(rstring);
110
- if (length == rstring_length) {
111
- return memcmp(str, RSTRING_PTR(rstring), length);
112
- } else {
113
- return (int)(length - rstring_length);
93
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
94
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
95
+ // inline a simpler memcmp outperforms calling the libc version.
96
+ long i = 0;
97
+
98
+ for (; i + 8 <= length; i += 8) {
99
+ uint64_t a, b;
100
+ memcpy(&a, str + i, 8);
101
+ memcpy(&b, rptr + i, 8);
102
+ if (a != b) {
103
+ a = __builtin_bswap64(a);
104
+ b = __builtin_bswap64(b);
105
+ return (a < b) ? -1 : 1;
106
+ }
107
+ }
108
+
109
+ for (; i < length; i++) {
110
+ if (str[i] != rptr[i]) {
111
+ return (str[i] < rptr[i]) ? -1 : 1;
112
+ }
114
113
  }
114
+
115
+ return 0;
115
116
  }
117
+ #endif
118
+ #endif
116
119
 
117
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
120
+ ALWAYS_INLINE(static) int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
118
121
  {
119
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
120
- // Common names aren't likely to be very long. So we just don't
121
- // cache names above an arbitrary threshold.
122
- return Qfalse;
123
- }
122
+ const char *rstring_ptr;
123
+ long rstring_length;
124
124
 
125
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
126
- // Simple heuristic, if the first character isn't a letter,
127
- // we're much less likely to see this string again.
128
- // We mostly want to cache strings that are likely to be repeated.
129
- return Qfalse;
125
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
126
+
127
+ if (length == rstring_length) {
128
+ return rstring_cache_memcmp(str, rstring_ptr, length);
129
+ } else {
130
+ return (int)(length - rstring_length);
130
131
  }
132
+ }
131
133
 
134
+ ALWAYS_INLINE(static) VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
135
+ {
132
136
  int low = 0;
133
137
  int high = cache->length - 1;
134
- int mid = 0;
135
- int last_cmp = 0;
136
138
 
137
139
  while (low <= high) {
138
- mid = (high + low) >> 1;
140
+ int mid = (high + low) >> 1;
139
141
  VALUE entry = cache->entries[mid];
140
- last_cmp = rstring_cache_cmp(str, length, entry);
142
+ int cmp = rstring_cache_cmp(str, length, entry);
141
143
 
142
- if (last_cmp == 0) {
144
+ if (cmp == 0) {
143
145
  return entry;
144
- } else if (last_cmp > 0) {
146
+ } else if (cmp > 0) {
145
147
  low = mid + 1;
146
148
  } else {
147
149
  high = mid - 1;
148
150
  }
149
151
  }
150
152
 
151
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
152
- // We assume the overwhelming majority of names don't need to be escaped.
153
- // But if they do, we have to fallback to the slow path.
154
- return Qfalse;
155
- }
156
-
157
153
  VALUE rstring = build_interned_string(str, length);
158
154
 
159
155
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
160
- if (last_cmp > 0) {
161
- mid += 1;
162
- }
163
-
164
- rvalue_cache_insert_at(cache, mid, rstring);
156
+ rvalue_cache_insert_at(cache, low, rstring);
165
157
  }
166
158
  return rstring;
167
159
  }
168
160
 
169
161
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
170
162
  {
171
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
172
- // Common names aren't likely to be very long. So we just don't
173
- // cache names above an arbitrary threshold.
174
- return Qfalse;
175
- }
176
-
177
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
178
- // Simple heuristic, if the first character isn't a letter,
179
- // we're much less likely to see this string again.
180
- // We mostly want to cache strings that are likely to be repeated.
181
- return Qfalse;
182
- }
183
-
184
163
  int low = 0;
185
164
  int high = cache->length - 1;
186
- int mid = 0;
187
- int last_cmp = 0;
188
165
 
189
166
  while (low <= high) {
190
- mid = (high + low) >> 1;
167
+ int mid = (high + low) >> 1;
191
168
  VALUE entry = cache->entries[mid];
192
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
169
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
193
170
 
194
- if (last_cmp == 0) {
171
+ if (cmp == 0) {
195
172
  return entry;
196
- } else if (last_cmp > 0) {
173
+ } else if (cmp > 0) {
197
174
  low = mid + 1;
198
175
  } else {
199
176
  high = mid - 1;
200
177
  }
201
178
  }
202
179
 
203
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
204
- // We assume the overwhelming majority of names don't need to be escaped.
205
- // But if they do, we have to fallback to the slow path.
206
- return Qfalse;
207
- }
208
-
209
180
  VALUE rsymbol = build_symbol(str, length);
210
181
 
211
182
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
212
- if (last_cmp > 0) {
213
- mid += 1;
214
- }
215
-
216
- rvalue_cache_insert_at(cache, mid, rsymbol);
183
+ rvalue_cache_insert_at(cache, low, rsymbol);
217
184
  }
218
185
  return rsymbol;
219
186
  }
@@ -328,15 +295,6 @@ static void rvalue_stack_eagerly_release(VALUE handle)
328
295
  }
329
296
  }
330
297
 
331
-
332
- #ifndef HAVE_STRNLEN
333
- static size_t strnlen(const char *s, size_t maxlen)
334
- {
335
- char *p;
336
- return ((p = memchr(s, '\0', maxlen)) ? p - s : maxlen);
337
- }
338
- #endif
339
-
340
298
  static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
341
299
  {
342
300
  int len = 1;
@@ -363,14 +321,21 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
363
321
  return len;
364
322
  }
365
323
 
324
+ enum duplicate_key_action {
325
+ JSON_DEPRECATED = 0,
326
+ JSON_IGNORE,
327
+ JSON_RAISE,
328
+ };
329
+
366
330
  typedef struct JSON_ParserStruct {
367
331
  VALUE on_load_proc;
368
332
  VALUE decimal_class;
369
333
  ID decimal_method_id;
334
+ enum duplicate_key_action on_duplicate_key;
370
335
  int max_nesting;
371
336
  bool allow_nan;
372
337
  bool allow_trailing_comma;
373
- bool parsing_name;
338
+ bool allow_control_characters;
374
339
  bool symbolize_names;
375
340
  bool freeze;
376
341
  } JSON_ParserConfig;
@@ -386,15 +351,24 @@ typedef struct JSON_ParserStateStruct {
386
351
  int current_nesting;
387
352
  } JSON_ParserState;
388
353
 
354
+ static inline size_t rest(JSON_ParserState *state) {
355
+ return state->end - state->cursor;
356
+ }
389
357
 
390
- #define PARSE_ERROR_FRAGMENT_LEN 32
391
- #ifdef RBIMPL_ATTR_NORETURN
392
- RBIMPL_ATTR_NORETURN()
393
- #endif
394
- static void raise_parse_error(const char *format, JSON_ParserState *state)
358
+ static inline bool eos(JSON_ParserState *state) {
359
+ return state->cursor >= state->end;
360
+ }
361
+
362
+ static inline char peek(JSON_ParserState *state)
395
363
  {
396
- unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
364
+ if (RB_UNLIKELY(eos(state))) {
365
+ return 0;
366
+ }
367
+ return *state->cursor;
368
+ }
397
369
 
370
+ static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
371
+ {
398
372
  const char *cursor = state->cursor;
399
373
  long column = 0;
400
374
  long line = 1;
@@ -411,6 +385,29 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
411
385
  line++;
412
386
  }
413
387
  }
388
+ *line_out = line;
389
+ *column_out = column;
390
+ }
391
+
392
+ static void emit_parse_warning(const char *message, JSON_ParserState *state)
393
+ {
394
+ long line, column;
395
+ cursor_position(state, &line, &column);
396
+
397
+ VALUE warning = rb_sprintf("%s at line %ld column %ld", message, line, column);
398
+ rb_funcall(mJSON, rb_intern("deprecation_warning"), 1, warning);
399
+ }
400
+
401
+ #define PARSE_ERROR_FRAGMENT_LEN 32
402
+
403
+ #ifdef RBIMPL_ATTR_NORETURN
404
+ RBIMPL_ATTR_NORETURN()
405
+ #endif
406
+ static void raise_parse_error(const char *format, JSON_ParserState *state)
407
+ {
408
+ unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
409
+ long line, column;
410
+ cursor_position(state, &line, &column);
414
411
 
415
412
  const char *ptr = "EOF";
416
413
  if (state->cursor && state->cursor < state->end) {
@@ -505,61 +502,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
505
502
 
506
503
  static const rb_data_type_t JSON_ParserConfig_type;
507
504
 
508
- static const bool whitespace[256] = {
509
- [' '] = 1,
510
- ['\t'] = 1,
511
- ['\n'] = 1,
512
- ['\r'] = 1,
513
- ['/'] = 1,
514
- };
515
-
516
505
  static void
517
506
  json_eat_comments(JSON_ParserState *state)
518
507
  {
519
- if (state->cursor + 1 < state->end) {
520
- switch(state->cursor[1]) {
521
- case '/': {
522
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
523
- if (!state->cursor) {
524
- state->cursor = state->end;
525
- } else {
526
- state->cursor++;
527
- }
528
- break;
508
+ const char *start = state->cursor;
509
+ state->cursor++;
510
+
511
+ switch (peek(state)) {
512
+ case '/': {
513
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
514
+ if (!state->cursor) {
515
+ state->cursor = state->end;
516
+ } else {
517
+ state->cursor++;
529
518
  }
530
- case '*': {
531
- state->cursor += 2;
532
- while (true) {
533
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
534
- if (!state->cursor) {
535
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
536
- } else {
537
- state->cursor++;
538
- if (state->cursor < state->end && *state->cursor == '/') {
539
- state->cursor++;
540
- break;
541
- }
542
- }
519
+ break;
520
+ }
521
+ case '*': {
522
+ state->cursor++;
523
+
524
+ while (true) {
525
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
526
+ if (!next_match) {
527
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
528
+ }
529
+
530
+ state->cursor = next_match + 1;
531
+ if (peek(state) == '/') {
532
+ state->cursor++;
533
+ break;
543
534
  }
544
- break;
545
535
  }
546
- default:
547
- raise_parse_error("unexpected token %s", state);
548
- break;
536
+ break;
549
537
  }
550
- } else {
551
- raise_parse_error("unexpected token %s", state);
538
+ default:
539
+ raise_parse_error_at("unexpected token %s", state, start);
540
+ break;
552
541
  }
553
542
  }
554
543
 
555
- static inline void
544
+ ALWAYS_INLINE(static) void
556
545
  json_eat_whitespace(JSON_ParserState *state)
557
546
  {
558
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
559
- if (RB_LIKELY(*state->cursor != '/')) {
560
- state->cursor++;
561
- } else {
562
- json_eat_comments(state);
547
+ while (true) {
548
+ switch (peek(state)) {
549
+ case ' ':
550
+ state->cursor++;
551
+ break;
552
+ case '\n':
553
+ state->cursor++;
554
+
555
+ // Heuristic: if we see a newline, there are likely consecutive spaces after it.
556
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
557
+ while (rest(state) > 8) {
558
+ uint64_t chunk;
559
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
560
+ if (chunk == 0x2020202020202020) {
561
+ state->cursor += 8;
562
+ continue;
563
+ }
564
+
565
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
566
+ state->cursor += consecutive_spaces;
567
+ break;
568
+ }
569
+ #endif
570
+ break;
571
+ case '\t':
572
+ case '\r':
573
+ state->cursor++;
574
+ break;
575
+ case '/':
576
+ json_eat_comments(state);
577
+ break;
578
+
579
+ default:
580
+ return;
563
581
  }
564
582
  }
565
583
  }
@@ -590,11 +608,22 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
590
608
  return result;
591
609
  }
592
610
 
593
- static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
611
+ static inline bool json_string_cacheable_p(const char *string, size_t length)
612
+ {
613
+ // We mostly want to cache strings that are likely to be repeated.
614
+ // Simple heuristics:
615
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
616
+ // - If the first character isn't a letter, we're much less likely to see this string again.
617
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
618
+ }
619
+
620
+ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name)
594
621
  {
622
+ bool intern = is_name || config->freeze;
623
+ bool symbolize = is_name && config->symbolize_names;
595
624
  size_t bufferSize = stringEnd - string;
596
625
 
597
- if (is_name && state->in_array) {
626
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
598
627
  VALUE cached_key;
599
628
  if (RB_UNLIKELY(symbolize)) {
600
629
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -610,60 +639,73 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *st
610
639
  return build_string(string, stringEnd, intern, symbolize);
611
640
  }
612
641
 
613
- static VALUE json_string_unescape(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
614
- {
615
- size_t bufferSize = stringEnd - string;
616
- const char *p = string, *pe = string, *unescape, *bufferStart;
617
- char *buffer;
618
- int unescape_len;
619
- char buf[4];
642
+ #define JSON_MAX_UNESCAPE_POSITIONS 16
643
+ typedef struct _json_unescape_positions {
644
+ long size;
645
+ const char **positions;
646
+ bool has_more;
647
+ } JSON_UnescapePositions;
620
648
 
621
- if (is_name && state->in_array) {
622
- VALUE cached_key;
623
- if (RB_UNLIKELY(symbolize)) {
624
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
625
- } else {
626
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
649
+ static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
650
+ {
651
+ while (positions->size) {
652
+ positions->size--;
653
+ const char *next_position = positions->positions[0];
654
+ positions->positions++;
655
+ if (next_position >= pe) {
656
+ return next_position;
627
657
  }
658
+ }
628
659
 
629
- if (RB_LIKELY(cached_key)) {
630
- return cached_key;
631
- }
660
+ if (positions->has_more) {
661
+ return memchr(pe, '\\', stringEnd - pe);
632
662
  }
633
663
 
664
+ return NULL;
665
+ }
666
+
667
+ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_ParserConfig *config, const char *string, const char *stringEnd, bool is_name, JSON_UnescapePositions *positions)
668
+ {
669
+ bool intern = is_name || config->freeze;
670
+ bool symbolize = is_name && config->symbolize_names;
671
+ size_t bufferSize = stringEnd - string;
672
+ const char *p = string, *pe = string, *bufferStart;
673
+ char *buffer;
674
+
634
675
  VALUE result = rb_str_buf_new(bufferSize);
635
676
  rb_enc_associate_index(result, utf8_encindex);
636
677
  buffer = RSTRING_PTR(result);
637
678
  bufferStart = buffer;
638
679
 
639
- while (pe < stringEnd && (pe = memchr(pe, '\\', stringEnd - pe))) {
640
- unescape = (char *) "?";
641
- unescape_len = 1;
680
+ #define APPEND_CHAR(chr) *buffer++ = chr; p = ++pe;
681
+
682
+ while (pe < stringEnd && (pe = json_next_backslash(pe, stringEnd, positions))) {
642
683
  if (pe > p) {
643
684
  MEMCPY(buffer, p, char, pe - p);
644
685
  buffer += pe - p;
645
686
  }
646
687
  switch (*++pe) {
688
+ case '"':
689
+ case '/':
690
+ p = pe; // nothing to unescape just need to skip the backslash
691
+ break;
692
+ case '\\':
693
+ APPEND_CHAR('\\');
694
+ break;
647
695
  case 'n':
648
- unescape = (char *) "\n";
696
+ APPEND_CHAR('\n');
649
697
  break;
650
698
  case 'r':
651
- unescape = (char *) "\r";
699
+ APPEND_CHAR('\r');
652
700
  break;
653
701
  case 't':
654
- unescape = (char *) "\t";
655
- break;
656
- case '"':
657
- unescape = (char *) "\"";
658
- break;
659
- case '\\':
660
- unescape = (char *) "\\";
702
+ APPEND_CHAR('\t');
661
703
  break;
662
704
  case 'b':
663
- unescape = (char *) "\b";
705
+ APPEND_CHAR('\b');
664
706
  break;
665
707
  case 'f':
666
- unescape = (char *) "\f";
708
+ APPEND_CHAR('\f');
667
709
  break;
668
710
  case 'u':
669
711
  if (pe > stringEnd - 5) {
@@ -688,26 +730,42 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
688
730
  }
689
731
  if (pe[0] == '\\' && pe[1] == 'u') {
690
732
  uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
733
+
734
+ if ((sur & 0xFC00) != 0xDC00) {
735
+ raise_parse_error_at("invalid surrogate pair at %s", state, p);
736
+ }
737
+
691
738
  ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
692
739
  | (sur & 0x3FF));
693
740
  pe += 5;
694
741
  } else {
695
- unescape = (char *) "?";
742
+ raise_parse_error_at("incomplete surrogate pair at %s", state, p);
696
743
  break;
697
744
  }
698
745
  }
699
- unescape_len = convert_UTF32_to_UTF8(buf, ch);
700
- unescape = buf;
746
+
747
+ char buf[4];
748
+ int unescape_len = convert_UTF32_to_UTF8(buf, ch);
749
+ MEMCPY(buffer, buf, char, unescape_len);
750
+ buffer += unescape_len;
751
+ p = ++pe;
701
752
  }
702
753
  break;
703
754
  default:
704
- p = pe;
705
- continue;
755
+ if ((unsigned char)*pe < 0x20) {
756
+ if (!config->allow_control_characters) {
757
+ if (*pe == '\n') {
758
+ raise_parse_error_at("Invalid unescaped newline character (\\n) in string: %s", state, pe - 1);
759
+ }
760
+ raise_parse_error_at("invalid ASCII control character in string: %s", state, pe - 1);
761
+ }
762
+ } else {
763
+ raise_parse_error_at("invalid escape character in string: %s", state, pe - 1);
764
+ }
765
+ break;
706
766
  }
707
- MEMCPY(buffer, unescape, char, unescape_len);
708
- buffer += unescape_len;
709
- p = ++pe;
710
767
  }
768
+ #undef APPEND_CHAR
711
769
 
712
770
  if (stringEnd > p) {
713
771
  MEMCPY(buffer, p, char, stringEnd - p);
@@ -718,33 +776,13 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
718
776
  if (symbolize) {
719
777
  result = rb_str_intern(result);
720
778
  } else if (intern) {
721
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
779
+ result = rb_str_to_interned_str(result);
722
780
  }
723
781
 
724
782
  return result;
725
783
  }
726
784
 
727
785
  #define MAX_FAST_INTEGER_SIZE 18
728
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
729
- {
730
- bool negative = false;
731
- if (*p == '-') {
732
- negative = true;
733
- p++;
734
- }
735
-
736
- long long memo = 0;
737
- while (p < pe) {
738
- memo *= 10;
739
- memo += *p - '0';
740
- p++;
741
- }
742
-
743
- if (negative) {
744
- memo = -memo;
745
- }
746
- return LL2NUM(memo);
747
- }
748
786
 
749
787
  static VALUE json_decode_large_integer(const char *start, long len)
750
788
  {
@@ -758,17 +796,27 @@ static VALUE json_decode_large_integer(const char *start, long len)
758
796
  }
759
797
 
760
798
  static inline VALUE
761
- json_decode_integer(const char *start, const char *end)
799
+ json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
762
800
  {
763
- long len = end - start;
764
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
765
- return fast_decode_integer(start, end);
801
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
802
+ if (negative) {
803
+ return INT64T2NUM(-((int64_t)mantissa));
766
804
  }
767
- return json_decode_large_integer(start, len);
805
+ return UINT64T2NUM(mantissa);
806
+ }
807
+
808
+ return json_decode_large_integer(start, end - start);
768
809
  }
769
810
 
770
811
  static VALUE json_decode_large_float(const char *start, long len)
771
812
  {
813
+ if (RB_LIKELY(len < 64)) {
814
+ char buffer[64];
815
+ MEMCPY(buffer, start, char, len);
816
+ buffer[len] = '\0';
817
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
818
+ }
819
+
772
820
  VALUE buffer_v;
773
821
  char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
774
822
  MEMCPY(buffer, start, char, len);
@@ -778,21 +826,24 @@ static VALUE json_decode_large_float(const char *start, long len)
778
826
  return number;
779
827
  }
780
828
 
781
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
829
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
830
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
831
+ */
832
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
833
+ const char *start, const char *end)
782
834
  {
783
- long len = end - start;
784
-
785
835
  if (RB_UNLIKELY(config->decimal_class)) {
786
- VALUE text = rb_str_new(start, len);
836
+ VALUE text = rb_str_new(start, end - start);
787
837
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
788
- } else if (RB_LIKELY(len < 64)) {
789
- char buffer[64];
790
- MEMCPY(buffer, start, char, len);
791
- buffer[len] = '\0';
792
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
793
- } else {
794
- return json_decode_large_float(start, len);
795
838
  }
839
+
840
+ // Fall back to rb_cstr_to_dbl for mantissas longer than 17 digits and for potential subnormals (rare edge cases)
841
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
842
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
843
+ return json_decode_large_float(start, end - start);
844
+ }
845
+
846
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
796
847
  }
797
848
 
798
849
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -807,32 +858,75 @@ static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig
807
858
  return array;
808
859
  }
809
860
 
810
- static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, long count)
861
+ static VALUE json_find_duplicated_key(size_t count, const VALUE *pairs)
811
862
  {
812
- VALUE object = rb_hash_new_capa(count);
813
- rb_hash_bulk_insert(count, rvalue_stack_peek(state->stack, count), object);
863
+ VALUE set = rb_hash_new_capa(count / 2);
864
+ for (size_t index = 0; index < count; index += 2) {
865
+ size_t before = RHASH_SIZE(set);
866
+ VALUE key = pairs[index];
867
+ rb_hash_aset(set, key, Qtrue);
868
+ if (RHASH_SIZE(set) == before) {
869
+ if (RB_SYMBOL_P(key)) {
870
+ return rb_sym2str(key);
871
+ }
872
+ return key;
873
+ }
874
+ }
875
+ return Qfalse;
876
+ }
814
877
 
815
- rvalue_stack_pop(state->stack, count);
878
+ static void emit_duplicate_key_warning(JSON_ParserState *state, VALUE duplicate_key)
879
+ {
880
+ VALUE message = rb_sprintf(
881
+ "detected duplicate key %"PRIsVALUE" in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`",
882
+ rb_inspect(duplicate_key)
883
+ );
816
884
 
817
- if (config->freeze) {
818
- RB_OBJ_FREEZE(object);
819
- }
885
+ emit_parse_warning(RSTRING_PTR(message), state);
886
+ RB_GC_GUARD(message);
887
+ }
820
888
 
821
- return object;
889
+ #ifdef RBIMPL_ATTR_NORETURN
890
+ RBIMPL_ATTR_NORETURN()
891
+ #endif
892
+ static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_key)
893
+ {
894
+ VALUE message = rb_sprintf(
895
+ "duplicate key %"PRIsVALUE,
896
+ rb_inspect(duplicate_key)
897
+ );
898
+
899
+ raise_parse_error(RSTRING_PTR(message), state);
900
+ RB_GC_GUARD(message);
822
901
  }
823
902
 
824
- static inline VALUE json_decode_string(JSON_ParserState *state, JSON_ParserConfig *config, const char *start, const char *end, bool escaped, bool is_name)
903
+ static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count)
825
904
  {
826
- VALUE string;
827
- bool intern = is_name || config->freeze;
828
- bool symbolize = is_name && config->symbolize_names;
829
- if (escaped) {
830
- string = json_string_unescape(state, start, end, is_name, intern, symbolize);
831
- } else {
832
- string = json_string_fastpath(state, start, end, is_name, intern, symbolize);
905
+ size_t entries_count = count / 2;
906
+ VALUE object = rb_hash_new_capa(entries_count);
907
+ const VALUE *pairs = rvalue_stack_peek(state->stack, count);
908
+ rb_hash_bulk_insert(count, pairs, object);
909
+
910
+ if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) {
911
+ switch (config->on_duplicate_key) {
912
+ case JSON_IGNORE:
913
+ break;
914
+ case JSON_DEPRECATED:
915
+ emit_duplicate_key_warning(state, json_find_duplicated_key(count, pairs));
916
+ break;
917
+ case JSON_RAISE:
918
+ raise_duplicate_key_error(state, json_find_duplicated_key(count, pairs));
919
+ break;
920
+ }
833
921
  }
834
922
 
835
- return string;
923
+ rvalue_stack_pop(state->stack, count);
924
+
925
+ if (config->freeze) {
926
+ RB_OBJ_FREEZE(object);
927
+ }
928
+
929
+ return object;
836
930
  }
837
931
 
838
932
  static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *config, VALUE value)
@@ -844,7 +938,7 @@ static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig *
844
938
  return value;
845
939
  }
846
940
 
847
- static const bool string_scan[256] = {
941
+ static const bool string_scan_table[256] = {
848
942
  // ASCII Control Characters
849
943
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
850
944
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -857,51 +951,252 @@ static const bool string_scan[256] = {
857
951
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
858
952
  };
859
953
 
860
- static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
954
+ #ifdef HAVE_SIMD
955
+ static SIMD_Implementation simd_impl = SIMD_NONE;
956
+ #endif /* HAVE_SIMD */
957
+
958
+ ALWAYS_INLINE(static) bool string_scan(JSON_ParserState *state)
861
959
  {
862
- state->cursor++;
863
- const char *start = state->cursor;
864
- bool escaped = false;
960
+ #ifdef HAVE_SIMD
961
+ #if defined(HAVE_SIMD_NEON)
865
962
 
866
- while (state->cursor < state->end) {
867
- if (RB_UNLIKELY(string_scan[(unsigned char)*state->cursor])) {
868
- switch (*state->cursor) {
869
- case '"': {
870
- VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name);
871
- state->cursor++;
872
- return json_push_value(state, config, string);
873
- }
874
- case '\\': {
875
- state->cursor++;
876
- escaped = true;
877
- if ((unsigned char)*state->cursor < 0x20) {
878
- raise_parse_error("invalid ASCII control character in string: %s", state);
879
- }
880
- break;
963
+ uint64_t mask = 0;
964
+ if (string_scan_simd_neon(&state->cursor, state->end, &mask)) {
965
+ state->cursor += trailing_zeros64(mask) >> 2;
966
+ return true;
967
+ }
968
+
969
+ #elif defined(HAVE_SIMD_SSE2)
970
+ if (simd_impl == SIMD_SSE2) {
971
+ int mask = 0;
972
+ if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) {
973
+ state->cursor += trailing_zeros(mask);
974
+ return true;
975
+ }
976
+ }
977
+ #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
978
+ #endif /* HAVE_SIMD */
979
+
980
+ while (!eos(state)) {
981
+ if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
982
+ return true;
983
+ }
984
+ state->cursor++;
985
+ }
986
+ return false;
987
+ }
988
+
989
+ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name, const char *start)
990
+ {
991
+ const char *backslashes[JSON_MAX_UNESCAPE_POSITIONS];
992
+ JSON_UnescapePositions positions = {
993
+ .size = 0,
994
+ .positions = backslashes,
995
+ .has_more = false,
996
+ };
997
+
998
+ do {
999
+ switch (*state->cursor) {
1000
+ case '"': {
1001
+ VALUE string = json_string_unescape(state, config, start, state->cursor, is_name, &positions);
1002
+ state->cursor++;
1003
+ return json_push_value(state, config, string);
1004
+ }
1005
+ case '\\': {
1006
+ if (RB_LIKELY(positions.size < JSON_MAX_UNESCAPE_POSITIONS)) {
1007
+ backslashes[positions.size] = state->cursor;
1008
+ positions.size++;
1009
+ } else {
1010
+ positions.has_more = true;
881
1011
  }
882
- default:
883
- raise_parse_error("invalid ASCII control character in string: %s", state);
884
- break;
1012
+ state->cursor++;
1013
+ break;
885
1014
  }
1015
+ default:
1016
+ if (!config->allow_control_characters) {
1017
+ raise_parse_error("invalid ASCII control character in string: %s", state);
1018
+ }
1019
+ break;
886
1020
  }
887
1021
 
888
1022
  state->cursor++;
889
- }
1023
+ } while (string_scan(state));
890
1024
 
891
1025
  raise_parse_error("unexpected end of input, expected closing \"", state);
892
1026
  return Qfalse;
893
1027
  }
894
1028
 
1029
+ ALWAYS_INLINE(static) VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name)
1030
+ {
1031
+ state->cursor++;
1032
+ const char *start = state->cursor;
1033
+
1034
+ if (RB_UNLIKELY(!string_scan(state))) {
1035
+ raise_parse_error("unexpected end of input, expected closing \"", state);
1036
+ }
1037
+
1038
+ if (RB_LIKELY(*state->cursor == '"')) {
1039
+ VALUE string = json_string_fastpath(state, config, start, state->cursor, is_name);
1040
+ state->cursor++;
1041
+ return json_push_value(state, config, string);
1042
+ }
1043
+ return json_parse_escaped_string(state, config, is_name, start);
1044
+ }
1045
+
1046
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1047
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1048
+ // Additional References:
1049
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1050
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1051
/*
 * SWAR conversion of eight ASCII decimal digits to their numeric value.
 *
 * `val` holds the eight characters packed in one 64-bit word, first character
 * in the lowest-order byte. The result is only meaningful when every byte is
 * in '0'..'9'; nothing is validated here.
 */
static inline uint64_t decode_8digits_unrolled(uint64_t val) {
    const uint64_t pair_mask  = 0x000000FF000000FF;
    const uint64_t scale_even = 0x000F424000000064; // 100 + (1000000ULL << 32)
    const uint64_t scale_odd  = 0x0000271000000001; // 1 + (10000ULL << 32)

    // Remove the ASCII bias so that each byte holds a digit value 0..9.
    const uint64_t digits = val - 0x3030303030303030;

    // Byte i becomes 10 * d[i] + d[i + 1]; at most 99, so no carry crosses bytes.
    const uint64_t pairs = (digits * 10) + (digits >> 8); // (digits * 2561) >> 8

    // Two-digit groups sit in bytes 0 and 4 (even) and bytes 2 and 6 (odd).
    const uint64_t even_groups = pairs & pair_mask;
    const uint64_t odd_groups = (pairs >> 16) & pair_mask;

    // The weighted sum of the four groups lands in the upper 32 bits.
    return ((even_groups * scale_even) + (odd_groups * scale_odd)) >> 32;
}
1060
+
1061
/*
 * SWAR conversion of four ASCII decimal digits to their numeric value.
 *
 * `val` holds the four characters packed in one 32-bit word, first character
 * in the lowest-order byte. The result is only meaningful when every byte is
 * in '0'..'9'; nothing is validated here.
 */
static inline uint64_t decode_4digits_unrolled(uint32_t val) {
    const uint32_t byte_mask = 0x000000FF;
    const uint32_t hundred = 100;

    // Remove the ASCII bias so that each byte holds a digit value 0..9.
    uint32_t digits = val - 0x30303030;

    // Byte i becomes 10 * d[i] + d[i + 1].
    uint32_t pairs = (digits * 10) + (digits >> 8); // (digits * 2561) >> 8

    uint32_t leading = pairs & byte_mask;          // first two digits
    uint32_t trailing = (pairs >> 16) & byte_mask; // last two digits
    return (leading * hundred) + trailing;
}
1069
+ #endif
1070
+
1071
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1072
+ {
1073
+ const char *start = state->cursor;
1074
+
1075
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1076
+ while (rest(state) >= sizeof(uint64_t)) {
1077
+ uint64_t next_8bytes;
1078
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1079
+
1080
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1081
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1082
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1083
+
1084
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1085
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1086
+ state->cursor += 8;
1087
+ continue;
1088
+ }
1089
+
1090
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1091
+
1092
+ if (consecutive_digits >= 4) {
1093
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1094
+ state->cursor += 4;
1095
+ consecutive_digits -= 4;
1096
+ }
1097
+
1098
+ while (consecutive_digits) {
1099
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1100
+ consecutive_digits--;
1101
+ state->cursor++;
1102
+ }
1103
+
1104
+ return (int)(state->cursor - start);
1105
+ }
1106
+ #endif
1107
+
1108
+ char next_char;
1109
+ while (rb_isdigit(next_char = peek(state))) {
1110
+ *accumulator = *accumulator * 10 + (next_char - '0');
1111
+ state->cursor++;
1112
+ }
1113
+ return (int)(state->cursor - start);
1114
+ }
1115
+
1116
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1117
+ {
1118
+ bool integer = true;
1119
+ const char first_digit = *state->cursor;
1120
+
1121
+ // Variables for Ryu optimization - extract digits during parsing
1122
+ int32_t exponent = 0;
1123
+ int decimal_point_pos = -1;
1124
+ uint64_t mantissa = 0;
1125
+
1126
+ // Parse integer part and extract mantissa digits
1127
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1128
+
1129
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1130
+ raise_parse_error_at("invalid number: %s", state, start);
1131
+ }
1132
+
1133
+ // Parse fractional part
1134
+ if (peek(state) == '.') {
1135
+ integer = false;
1136
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1137
+ state->cursor++;
1138
+
1139
+ int fractional_digits = json_parse_digits(state, &mantissa);
1140
+ mantissa_digits += fractional_digits;
1141
+
1142
+ if (RB_UNLIKELY(!fractional_digits)) {
1143
+ raise_parse_error_at("invalid number: %s", state, start);
1144
+ }
1145
+ }
1146
+
1147
+ // Parse exponent
1148
+ if (rb_tolower(peek(state)) == 'e') {
1149
+ integer = false;
1150
+ state->cursor++;
1151
+
1152
+ bool negative_exponent = false;
1153
+ const char next_char = peek(state);
1154
+ if (next_char == '-' || next_char == '+') {
1155
+ negative_exponent = next_char == '-';
1156
+ state->cursor++;
1157
+ }
1158
+
1159
+ uint64_t abs_exponent = 0;
1160
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1161
+
1162
+ if (RB_UNLIKELY(!exponent_digits)) {
1163
+ raise_parse_error_at("invalid number: %s", state, start);
1164
+ }
1165
+
1166
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1167
+ }
1168
+
1169
+ if (integer) {
1170
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1171
+ }
1172
+
1173
+ // Adjust exponent based on decimal point position
1174
+ if (decimal_point_pos >= 0) {
1175
+ exponent -= (mantissa_digits - decimal_point_pos);
1176
+ }
1177
+
1178
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1179
+ }
1180
+
1181
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1182
+ {
1183
+ return json_parse_number(state, config, false, state->cursor);
1184
+ }
1185
+
1186
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1187
+ {
1188
+ const char *start = state->cursor;
1189
+ state->cursor++;
1190
+ return json_parse_number(state, config, true, start);
1191
+ }
1192
+
895
1193
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
896
1194
  {
897
1195
  json_eat_whitespace(state);
898
- if (state->cursor >= state->end) {
899
- raise_parse_error("unexpected end of input", state);
900
- }
901
1196
 
902
- switch (*state->cursor) {
1197
+ switch (peek(state)) {
903
1198
  case 'n':
904
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1199
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
905
1200
  state->cursor += 4;
906
1201
  return json_push_value(state, config, Qnil);
907
1202
  }
@@ -909,7 +1204,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
909
1204
  raise_parse_error("unexpected token %s", state);
910
1205
  break;
911
1206
  case 't':
912
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1207
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
913
1208
  state->cursor += 4;
914
1209
  return json_push_value(state, config, Qtrue);
915
1210
  }
@@ -918,7 +1213,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
918
1213
  break;
919
1214
  case 'f':
920
1215
  // Note: memcmp with a small power of two compiles to an integer comparison
921
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1216
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
922
1217
  state->cursor += 5;
923
1218
  return json_push_value(state, config, Qfalse);
924
1219
  }
@@ -927,7 +1222,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
927
1222
  break;
928
1223
  case 'N':
929
1224
  // Note: memcmp with a small power of two compiles to an integer comparison
930
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1225
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
931
1226
  state->cursor += 3;
932
1227
  return json_push_value(state, config, CNaN);
933
1228
  }
@@ -935,16 +1230,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
935
1230
  raise_parse_error("unexpected token %s", state);
936
1231
  break;
937
1232
  case 'I':
938
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1233
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
939
1234
  state->cursor += 8;
940
1235
  return json_push_value(state, config, CInfinity);
941
1236
  }
942
1237
 
943
1238
  raise_parse_error("unexpected token %s", state);
944
1239
  break;
945
- case '-':
1240
+ case '-': {
946
1241
  // Note: memcmp with a small power of two compiles to an integer comparison
947
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1242
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
948
1243
  if (config->allow_nan) {
949
1244
  state->cursor += 9;
950
1245
  return json_push_value(state, config, CMinusInfinity);
@@ -952,62 +1247,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
952
1247
  raise_parse_error("unexpected token %s", state);
953
1248
  }
954
1249
  }
955
- // Fallthrough
956
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
957
- bool integer = true;
958
-
959
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
960
- const char *start = state->cursor;
961
- state->cursor++;
962
-
963
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
964
- state->cursor++;
965
- }
966
-
967
- long integer_length = state->cursor - start;
968
-
969
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
970
- raise_parse_error_at("invalid number: %s", state, start);
971
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
972
- raise_parse_error_at("invalid number: %s", state, start);
973
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
974
- raise_parse_error_at("invalid number: %s", state, start);
975
- }
976
-
977
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
978
- integer = false;
979
- state->cursor++;
980
-
981
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
982
- raise_parse_error("invalid number: %s", state);
983
- }
984
-
985
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
986
- state->cursor++;
987
- }
988
- }
989
-
990
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
991
- integer = false;
992
- state->cursor++;
993
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
994
- state->cursor++;
995
- }
996
-
997
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
998
- raise_parse_error("invalid number: %s", state);
999
- }
1000
-
1001
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1002
- state->cursor++;
1003
- }
1004
- }
1005
-
1006
- if (integer) {
1007
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1008
- }
1009
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1250
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1251
+ break;
1010
1252
  }
1253
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1254
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1255
+ break;
1011
1256
  case '"': {
1012
1257
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1013
1258
  return json_parse_string(state, config, false);
@@ -1018,7 +1263,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1018
1263
  json_eat_whitespace(state);
1019
1264
  long stack_head = state->stack->head;
1020
1265
 
1021
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1266
+ if (peek(state) == ']') {
1022
1267
  state->cursor++;
1023
1268
  return json_push_value(state, config, json_decode_array(state, config, 0));
1024
1269
  } else {
@@ -1033,26 +1278,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1033
1278
  while (true) {
1034
1279
  json_eat_whitespace(state);
1035
1280
 
1036
- if (state->cursor < state->end) {
1037
- if (*state->cursor == ']') {
1038
- state->cursor++;
1039
- long count = state->stack->head - stack_head;
1040
- state->current_nesting--;
1041
- state->in_array--;
1042
- return json_push_value(state, config, json_decode_array(state, config, count));
1043
- }
1281
+ const char next_char = peek(state);
1044
1282
 
1045
- if (*state->cursor == ',') {
1046
- state->cursor++;
1047
- if (config->allow_trailing_comma) {
1048
- json_eat_whitespace(state);
1049
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1050
- continue;
1051
- }
1283
+ if (RB_LIKELY(next_char == ',')) {
1284
+ state->cursor++;
1285
+ if (config->allow_trailing_comma) {
1286
+ json_eat_whitespace(state);
1287
+ if (peek(state) == ']') {
1288
+ continue;
1052
1289
  }
1053
- json_parse_any(state, config);
1054
- continue;
1055
1290
  }
1291
+ json_parse_any(state, config);
1292
+ continue;
1293
+ }
1294
+
1295
+ if (next_char == ']') {
1296
+ state->cursor++;
1297
+ long count = state->stack->head - stack_head;
1298
+ state->current_nesting--;
1299
+ state->in_array--;
1300
+ return json_push_value(state, config, json_decode_array(state, config, count));
1056
1301
  }
1057
1302
 
1058
1303
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1060,11 +1305,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1060
1305
  break;
1061
1306
  }
1062
1307
  case '{': {
1308
+ const char *object_start_cursor = state->cursor;
1309
+
1063
1310
  state->cursor++;
1064
1311
  json_eat_whitespace(state);
1065
1312
  long stack_head = state->stack->head;
1066
1313
 
1067
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1314
+ if (peek(state) == '}') {
1068
1315
  state->cursor++;
1069
1316
  return json_push_value(state, config, json_decode_object(state, config, 0));
1070
1317
  } else {
@@ -1073,13 +1320,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1073
1320
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1074
1321
  }
1075
1322
 
1076
- if (*state->cursor != '"') {
1323
+ if (peek(state) != '"') {
1077
1324
  raise_parse_error("expected object key, got %s", state);
1078
1325
  }
1079
1326
  json_parse_string(state, config, true);
1080
1327
 
1081
1328
  json_eat_whitespace(state);
1082
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1329
+ if (peek(state) != ':') {
1083
1330
  raise_parse_error("expected ':' after object key", state);
1084
1331
  }
1085
1332
  state->cursor++;
@@ -1090,39 +1337,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1090
1337
  while (true) {
1091
1338
  json_eat_whitespace(state);
1092
1339
 
1093
- if (state->cursor < state->end) {
1094
- if (*state->cursor == '}') {
1095
- state->cursor++;
1096
- state->current_nesting--;
1097
- long count = state->stack->head - stack_head;
1098
- return json_push_value(state, config, json_decode_object(state, config, count));
1099
- }
1340
+ const char next_char = peek(state);
1341
+ if (next_char == '}') {
1342
+ state->cursor++;
1343
+ state->current_nesting--;
1344
+ size_t count = state->stack->head - stack_head;
1100
1345
 
1101
- if (*state->cursor == ',') {
1102
- state->cursor++;
1103
- json_eat_whitespace(state);
1346
+ // Temporarily rewind the cursor in case an error is raised
1347
+ const char *final_cursor = state->cursor;
1348
+ state->cursor = object_start_cursor;
1349
+ VALUE object = json_decode_object(state, config, count);
1350
+ state->cursor = final_cursor;
1104
1351
 
1105
- if (config->allow_trailing_comma) {
1106
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1107
- continue;
1108
- }
1109
- }
1352
+ return json_push_value(state, config, object);
1353
+ }
1110
1354
 
1111
- if (*state->cursor != '"') {
1112
- raise_parse_error("expected object key, got: %s", state);
1113
- }
1114
- json_parse_string(state, config, true);
1355
+ if (next_char == ',') {
1356
+ state->cursor++;
1357
+ json_eat_whitespace(state);
1115
1358
 
1116
- json_eat_whitespace(state);
1117
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1118
- raise_parse_error("expected ':' after object key, got: %s", state);
1359
+ if (config->allow_trailing_comma) {
1360
+ if (peek(state) == '}') {
1361
+ continue;
1119
1362
  }
1120
- state->cursor++;
1363
+ }
1121
1364
 
1122
- json_parse_any(state, config);
1365
+ if (RB_UNLIKELY(peek(state) != '"')) {
1366
+ raise_parse_error("expected object key, got: %s", state);
1367
+ }
1368
+ json_parse_string(state, config, true);
1123
1369
 
1124
- continue;
1370
+ json_eat_whitespace(state);
1371
+ if (RB_UNLIKELY(peek(state) != ':')) {
1372
+ raise_parse_error("expected ':' after object key, got: %s", state);
1125
1373
  }
1374
+ state->cursor++;
1375
+
1376
+ json_parse_any(state, config);
1377
+
1378
+ continue;
1126
1379
  }
1127
1380
 
1128
1381
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1130,18 +1383,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1130
1383
  break;
1131
1384
  }
1132
1385
 
1386
+ case 0:
1387
+ raise_parse_error("unexpected end of input", state);
1388
+ break;
1389
+
1133
1390
  default:
1134
1391
  raise_parse_error("unexpected character: %s", state);
1135
1392
  break;
1136
1393
  }
1137
1394
 
1138
- raise_parse_error("unreacheable: %s", state);
1395
+ raise_parse_error("unreachable: %s", state);
1396
+ return Qundef;
1139
1397
  }
1140
1398
 
1141
1399
  static void json_ensure_eof(JSON_ParserState *state)
1142
1400
  {
1143
1401
  json_eat_whitespace(state);
1144
- if (state->cursor != state->end) {
1402
+ if (!eos(state)) {
1145
1403
  raise_parse_error("unexpected token at end of stream %s", state);
1146
1404
  }
1147
1405
  }
@@ -1178,13 +1436,15 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data)
1178
1436
  {
1179
1437
  JSON_ParserConfig *config = (JSON_ParserConfig *)data;
1180
1438
 
1181
- if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1182
- else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1183
- else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1184
- else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1185
- else if (key == sym_freeze) { config->freeze = RTEST(val); }
1186
- else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1187
- else if (key == sym_decimal_class) {
1439
+ if (key == sym_max_nesting) { config->max_nesting = RTEST(val) ? FIX2INT(val) : 0; }
1440
+ else if (key == sym_allow_nan) { config->allow_nan = RTEST(val); }
1441
+ else if (key == sym_allow_trailing_comma) { config->allow_trailing_comma = RTEST(val); }
1442
+ else if (key == sym_allow_control_characters) { config->allow_control_characters = RTEST(val); }
1443
+ else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); }
1444
+ else if (key == sym_freeze) { config->freeze = RTEST(val); }
1445
+ else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; }
1446
+ else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; }
1447
+ else if (key == sym_decimal_class) {
1188
1448
  if (RTEST(val)) {
1189
1449
  if (rb_respond_to(val, i_try_convert)) {
1190
1450
  config->decimal_class = val;
@@ -1257,6 +1517,7 @@ static void parser_config_init(JSON_ParserConfig *config, VALUE opts)
1257
1517
  */
1258
1518
  static VALUE cParserConfig_initialize(VALUE self, VALUE opts)
1259
1519
  {
1520
+ rb_check_frozen(self);
1260
1521
  GET_PARSER_CONFIG;
1261
1522
 
1262
1523
  parser_config_init(config, opts);
@@ -1352,7 +1613,7 @@ static const rb_data_type_t JSON_ParserConfig_type = {
1352
1613
  JSON_ParserConfig_memsize,
1353
1614
  },
1354
1615
  0, 0,
1355
- RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED,
1616
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_FROZEN_SHAREABLE,
1356
1617
  };
1357
1618
 
1358
1619
  static VALUE cJSON_parser_s_allocate(VALUE klass)
@@ -1396,15 +1657,13 @@ void Init_parser(void)
1396
1657
  sym_max_nesting = ID2SYM(rb_intern("max_nesting"));
1397
1658
  sym_allow_nan = ID2SYM(rb_intern("allow_nan"));
1398
1659
  sym_allow_trailing_comma = ID2SYM(rb_intern("allow_trailing_comma"));
1660
+ sym_allow_control_characters = ID2SYM(rb_intern("allow_control_characters"));
1399
1661
  sym_symbolize_names = ID2SYM(rb_intern("symbolize_names"));
1400
1662
  sym_freeze = ID2SYM(rb_intern("freeze"));
1401
1663
  sym_on_load = ID2SYM(rb_intern("on_load"));
1402
1664
  sym_decimal_class = ID2SYM(rb_intern("decimal_class"));
1665
+ sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key"));
1403
1666
 
1404
- i_chr = rb_intern("chr");
1405
- i_aset = rb_intern("[]=");
1406
- i_aref = rb_intern("[]");
1407
- i_leftshift = rb_intern("<<");
1408
1667
  i_new = rb_intern("new");
1409
1668
  i_try_convert = rb_intern("try_convert");
1410
1669
  i_uminus = rb_intern("-@");
@@ -1413,4 +1672,8 @@ void Init_parser(void)
1413
1672
  binary_encindex = rb_ascii8bit_encindex();
1414
1673
  utf8_encindex = rb_utf8_encindex();
1415
1674
  enc_utf8 = rb_utf8_encoding();
1675
+
1676
+ #ifdef HAVE_SIMD
1677
+ simd_impl = find_simd_implementation();
1678
+ #endif
1416
1679
  }