json 2.15.2.1 → 2.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,35 +1,7 @@
1
- #include "ruby.h"
2
- #include "ruby/encoding.h"
3
-
4
- /* shims */
5
- /* This is the fallback definition from Ruby 3.4 */
6
-
7
- #ifndef RBIMPL_STDBOOL_H
8
- #if defined(__cplusplus)
9
- # if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
10
- # include <cstdbool>
11
- # endif
12
- #elif defined(HAVE_STDBOOL_H)
13
- # include <stdbool.h>
14
- #elif !defined(HAVE__BOOL)
15
- typedef unsigned char _Bool;
16
- # define bool _Bool
17
- # define true ((_Bool)+1)
18
- # define false ((_Bool)+0)
19
- # define __bool_true_false_are_defined
20
- #endif
21
- #endif
22
-
1
+ #include "../json.h"
2
+ #include "../vendor/ryu.h"
23
3
  #include "../simd/simd.h"
24
4
 
25
- #ifndef RB_UNLIKELY
26
- #define RB_UNLIKELY(expr) expr
27
- #endif
28
-
29
- #ifndef RB_LIKELY
30
- #define RB_LIKELY(expr) expr
31
- #endif
32
-
33
5
  static VALUE mJSON, eNestingError, Encoding_UTF_8;
34
6
  static VALUE CNaN, CInfinity, CMinusInfinity;
35
7
 
@@ -44,7 +16,7 @@ static int utf8_encindex;
44
16
 
45
17
  #ifndef HAVE_RB_HASH_BULK_INSERT
46
18
  // For TruffleRuby
47
- void
19
+ static void
48
20
  rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
49
21
  {
50
22
  long index = 0;
@@ -61,6 +33,12 @@ rb_hash_bulk_insert(long count, const VALUE *pairs, VALUE hash)
61
33
  #define rb_hash_new_capa(n) rb_hash_new()
62
34
  #endif
63
35
 
36
+ #ifndef HAVE_RB_STR_TO_INTERNED_STR
37
+ static VALUE rb_str_to_interned_str(VALUE str)
38
+ {
39
+ return rb_funcall(rb_str_freeze(str), i_uminus, 0);
40
+ }
41
+ #endif
64
42
 
65
43
  /* name cache */
66
44
 
@@ -106,116 +84,104 @@ static void rvalue_cache_insert_at(rvalue_cache *cache, int index, VALUE rstring
106
84
  cache->entries[index] = rstring;
107
85
  }
108
86
 
109
- static inline int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
87
+ #define rstring_cache_memcmp memcmp
88
+
89
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
90
+ #if __has_builtin(__builtin_bswap64)
91
+ #undef rstring_cache_memcmp
92
+ static ALWAYS_INLINE() int rstring_cache_memcmp(const char *str, const char *rptr, const long length)
110
93
  {
111
- long rstring_length = RSTRING_LEN(rstring);
112
- if (length == rstring_length) {
113
- return memcmp(str, RSTRING_PTR(rstring), length);
114
- } else {
115
- return (int)(length - rstring_length);
94
+ // The libc memcmp has numerous complex optimizations, but in this particular case,
95
+ // we know the string is small (JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH), so being able to
96
+ // inline a simpler memcmp outperforms calling the libc version.
97
+ long i = 0;
98
+
99
+ for (; i + 8 <= length; i += 8) {
100
+ uint64_t a, b;
101
+ memcpy(&a, str + i, 8);
102
+ memcpy(&b, rptr + i, 8);
103
+ if (a != b) {
104
+ a = __builtin_bswap64(a);
105
+ b = __builtin_bswap64(b);
106
+ return (a < b) ? -1 : 1;
107
+ }
116
108
  }
109
+
110
+ for (; i < length; i++) {
111
+ if (str[i] != rptr[i]) {
112
+ return (str[i] < rptr[i]) ? -1 : 1;
113
+ }
114
+ }
115
+
116
+ return 0;
117
117
  }
118
+ #endif
119
+ #endif
118
120
 
119
- static VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
121
+ static ALWAYS_INLINE() int rstring_cache_cmp(const char *str, const long length, VALUE rstring)
120
122
  {
121
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
122
- // Common names aren't likely to be very long. So we just don't
123
- // cache names above an arbitrary threshold.
124
- return Qfalse;
125
- }
123
+ const char *rstring_ptr;
124
+ long rstring_length;
126
125
 
127
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
128
- // Simple heuristic, if the first character isn't a letter,
129
- // we're much less likely to see this string again.
130
- // We mostly want to cache strings that are likely to be repeated.
131
- return Qfalse;
126
+ RSTRING_GETMEM(rstring, rstring_ptr, rstring_length);
127
+
128
+ if (length == rstring_length) {
129
+ return rstring_cache_memcmp(str, rstring_ptr, length);
130
+ } else {
131
+ return (int)(length - rstring_length);
132
132
  }
133
+ }
133
134
 
135
+ static ALWAYS_INLINE() VALUE rstring_cache_fetch(rvalue_cache *cache, const char *str, const long length)
136
+ {
134
137
  int low = 0;
135
138
  int high = cache->length - 1;
136
- int mid = 0;
137
- int last_cmp = 0;
138
139
 
139
140
  while (low <= high) {
140
- mid = (high + low) >> 1;
141
+ int mid = (high + low) >> 1;
141
142
  VALUE entry = cache->entries[mid];
142
- last_cmp = rstring_cache_cmp(str, length, entry);
143
+ int cmp = rstring_cache_cmp(str, length, entry);
143
144
 
144
- if (last_cmp == 0) {
145
+ if (cmp == 0) {
145
146
  return entry;
146
- } else if (last_cmp > 0) {
147
+ } else if (cmp > 0) {
147
148
  low = mid + 1;
148
149
  } else {
149
150
  high = mid - 1;
150
151
  }
151
152
  }
152
153
 
153
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
154
- // We assume the overwhelming majority of names don't need to be escaped.
155
- // But if they do, we have to fallback to the slow path.
156
- return Qfalse;
157
- }
158
-
159
154
  VALUE rstring = build_interned_string(str, length);
160
155
 
161
156
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
162
- if (last_cmp > 0) {
163
- mid += 1;
164
- }
165
-
166
- rvalue_cache_insert_at(cache, mid, rstring);
157
+ rvalue_cache_insert_at(cache, low, rstring);
167
158
  }
168
159
  return rstring;
169
160
  }
170
161
 
171
162
  static VALUE rsymbol_cache_fetch(rvalue_cache *cache, const char *str, const long length)
172
163
  {
173
- if (RB_UNLIKELY(length > JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH)) {
174
- // Common names aren't likely to be very long. So we just don't
175
- // cache names above an arbitrary threshold.
176
- return Qfalse;
177
- }
178
-
179
- if (RB_UNLIKELY(!isalpha((unsigned char)str[0]))) {
180
- // Simple heuristic, if the first character isn't a letter,
181
- // we're much less likely to see this string again.
182
- // We mostly want to cache strings that are likely to be repeated.
183
- return Qfalse;
184
- }
185
-
186
164
  int low = 0;
187
165
  int high = cache->length - 1;
188
- int mid = 0;
189
- int last_cmp = 0;
190
166
 
191
167
  while (low <= high) {
192
- mid = (high + low) >> 1;
168
+ int mid = (high + low) >> 1;
193
169
  VALUE entry = cache->entries[mid];
194
- last_cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
170
+ int cmp = rstring_cache_cmp(str, length, rb_sym2str(entry));
195
171
 
196
- if (last_cmp == 0) {
172
+ if (cmp == 0) {
197
173
  return entry;
198
- } else if (last_cmp > 0) {
174
+ } else if (cmp > 0) {
199
175
  low = mid + 1;
200
176
  } else {
201
177
  high = mid - 1;
202
178
  }
203
179
  }
204
180
 
205
- if (RB_UNLIKELY(memchr(str, '\\', length))) {
206
- // We assume the overwhelming majority of names don't need to be escaped.
207
- // But if they do, we have to fallback to the slow path.
208
- return Qfalse;
209
- }
210
-
211
181
  VALUE rsymbol = build_symbol(str, length);
212
182
 
213
183
  if (cache->length < JSON_RVALUE_CACHE_CAPA) {
214
- if (last_cmp > 0) {
215
- mid += 1;
216
- }
217
-
218
- rvalue_cache_insert_at(cache, mid, rsymbol);
184
+ rvalue_cache_insert_at(cache, low, rsymbol);
219
185
  }
220
186
  return rsymbol;
221
187
  }
@@ -395,6 +361,22 @@ typedef struct JSON_ParserStateStruct {
395
361
  int current_nesting;
396
362
  } JSON_ParserState;
397
363
 
364
+ static inline size_t rest(JSON_ParserState *state) {
365
+ return state->end - state->cursor;
366
+ }
367
+
368
+ static inline bool eos(JSON_ParserState *state) {
369
+ return state->cursor >= state->end;
370
+ }
371
+
372
+ static inline char peek(JSON_ParserState *state)
373
+ {
374
+ if (RB_UNLIKELY(eos(state))) {
375
+ return 0;
376
+ }
377
+ return *state->cursor;
378
+ }
379
+
398
380
  static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out)
399
381
  {
400
382
  const char *cursor = state->cursor;
@@ -428,9 +410,14 @@ static void emit_parse_warning(const char *message, JSON_ParserState *state)
428
410
 
429
411
  #define PARSE_ERROR_FRAGMENT_LEN 32
430
412
 
431
- static VALUE build_parse_error_message(const char *format, JSON_ParserState *state, long line, long column)
413
+ #ifdef RBIMPL_ATTR_NORETURN
414
+ RBIMPL_ATTR_NORETURN()
415
+ #endif
416
+ static void raise_parse_error(const char *format, JSON_ParserState *state)
432
417
  {
433
418
  unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
419
+ long line, column;
420
+ cursor_position(state, &line, &column);
434
421
 
435
422
  const char *ptr = "EOF";
436
423
  if (state->cursor && state->cursor < state->end) {
@@ -465,23 +452,11 @@ static VALUE build_parse_error_message(const char *format, JSON_ParserState *sta
465
452
  VALUE msg = rb_sprintf(format, ptr);
466
453
  VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column);
467
454
  RB_GC_GUARD(msg);
468
- return message;
469
- }
470
455
 
471
- static VALUE parse_error_new(VALUE message, long line, long column)
472
- {
473
456
  VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message);
474
457
  rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line));
475
458
  rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column));
476
- return exc;
477
- }
478
-
479
- NORETURN(static) void raise_parse_error(const char *format, JSON_ParserState *state)
480
- {
481
- long line, column;
482
- cursor_position(state, &line, &column);
483
- VALUE message = build_parse_error_message(format, state, line, column);
484
- rb_exc_raise(parse_error_new(message, line, column));
459
+ rb_exc_raise(exc);
485
460
  }
486
461
 
487
462
  #ifdef RBIMPL_ATTR_NORETURN
@@ -537,61 +512,82 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
537
512
 
538
513
  static const rb_data_type_t JSON_ParserConfig_type;
539
514
 
540
- static const bool whitespace[256] = {
541
- [' '] = 1,
542
- ['\t'] = 1,
543
- ['\n'] = 1,
544
- ['\r'] = 1,
545
- ['/'] = 1,
546
- };
547
-
548
515
  static void
549
516
  json_eat_comments(JSON_ParserState *state)
550
517
  {
551
- if (state->cursor + 1 < state->end) {
552
- switch (state->cursor[1]) {
553
- case '/': {
554
- state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
555
- if (!state->cursor) {
556
- state->cursor = state->end;
557
- } else {
558
- state->cursor++;
559
- }
560
- break;
518
+ const char *start = state->cursor;
519
+ state->cursor++;
520
+
521
+ switch (peek(state)) {
522
+ case '/': {
523
+ state->cursor = memchr(state->cursor, '\n', state->end - state->cursor);
524
+ if (!state->cursor) {
525
+ state->cursor = state->end;
526
+ } else {
527
+ state->cursor++;
561
528
  }
562
- case '*': {
563
- state->cursor += 2;
564
- while (true) {
565
- state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
566
- if (!state->cursor) {
567
- raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
568
- } else {
569
- state->cursor++;
570
- if (state->cursor < state->end && *state->cursor == '/') {
571
- state->cursor++;
572
- break;
573
- }
574
- }
529
+ break;
530
+ }
531
+ case '*': {
532
+ state->cursor++;
533
+
534
+ while (true) {
535
+ const char *next_match = memchr(state->cursor, '*', state->end - state->cursor);
536
+ if (!next_match) {
537
+ raise_parse_error_at("unterminated comment, expected closing '*/'", state, start);
538
+ }
539
+
540
+ state->cursor = next_match + 1;
541
+ if (peek(state) == '/') {
542
+ state->cursor++;
543
+ break;
575
544
  }
576
- break;
577
545
  }
578
- default:
579
- raise_parse_error("unexpected token %s", state);
580
- break;
546
+ break;
581
547
  }
582
- } else {
583
- raise_parse_error("unexpected token %s", state);
548
+ default:
549
+ raise_parse_error_at("unexpected token %s", state, start);
550
+ break;
584
551
  }
585
552
  }
586
553
 
587
- static inline void
554
+ static ALWAYS_INLINE() void
588
555
  json_eat_whitespace(JSON_ParserState *state)
589
556
  {
590
- while (state->cursor < state->end && RB_UNLIKELY(whitespace[(unsigned char)*state->cursor])) {
591
- if (RB_LIKELY(*state->cursor != '/')) {
592
- state->cursor++;
593
- } else {
594
- json_eat_comments(state);
557
+ while (true) {
558
+ switch (peek(state)) {
559
+ case ' ':
560
+ state->cursor++;
561
+ break;
562
+ case '\n':
563
+ state->cursor++;
564
+
565
+ // Heuristic: if we see a newline, there is likely consecutive spaces after it.
566
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
567
+ while (rest(state) > 8) {
568
+ uint64_t chunk;
569
+ memcpy(&chunk, state->cursor, sizeof(uint64_t));
570
+ if (chunk == 0x2020202020202020) {
571
+ state->cursor += 8;
572
+ continue;
573
+ }
574
+
575
+ uint32_t consecutive_spaces = trailing_zeros64(chunk ^ 0x2020202020202020) / CHAR_BIT;
576
+ state->cursor += consecutive_spaces;
577
+ break;
578
+ }
579
+ #endif
580
+ break;
581
+ case '\t':
582
+ case '\r':
583
+ state->cursor++;
584
+ break;
585
+ case '/':
586
+ json_eat_comments(state);
587
+ break;
588
+
589
+ default:
590
+ return;
595
591
  }
596
592
  }
597
593
  }
@@ -622,11 +618,20 @@ static inline VALUE build_string(const char *start, const char *end, bool intern
622
618
  return result;
623
619
  }
624
620
 
621
+ static inline bool json_string_cacheable_p(const char *string, size_t length)
622
+ {
623
+ // We mostly want to cache strings that are likely to be repeated.
624
+ // Simple heuristics:
625
+ // - Common names aren't likely to be very long. So we just don't cache names above an arbitrary threshold.
626
+ // - If the first character isn't a letter, we're much less likely to see this string again.
627
+ return length <= JSON_RVALUE_CACHE_MAX_ENTRY_LENGTH && rb_isalpha(string[0]);
628
+ }
629
+
625
630
  static inline VALUE json_string_fastpath(JSON_ParserState *state, const char *string, const char *stringEnd, bool is_name, bool intern, bool symbolize)
626
631
  {
627
632
  size_t bufferSize = stringEnd - string;
628
633
 
629
- if (is_name && state->in_array) {
634
+ if (is_name && state->in_array && RB_LIKELY(json_string_cacheable_p(string, bufferSize))) {
630
635
  VALUE cached_key;
631
636
  if (RB_UNLIKELY(symbolize)) {
632
637
  cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
@@ -650,19 +655,6 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
650
655
  int unescape_len;
651
656
  char buf[4];
652
657
 
653
- if (is_name && state->in_array) {
654
- VALUE cached_key;
655
- if (RB_UNLIKELY(symbolize)) {
656
- cached_key = rsymbol_cache_fetch(&state->name_cache, string, bufferSize);
657
- } else {
658
- cached_key = rstring_cache_fetch(&state->name_cache, string, bufferSize);
659
- }
660
-
661
- if (RB_LIKELY(cached_key)) {
662
- return cached_key;
663
- }
664
- }
665
-
666
658
  VALUE result = rb_str_buf_new(bufferSize);
667
659
  rb_enc_associate_index(result, utf8_encindex);
668
660
  buffer = RSTRING_PTR(result);
@@ -755,33 +747,13 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
755
747
  if (symbolize) {
756
748
  result = rb_str_intern(result);
757
749
  } else if (intern) {
758
- result = rb_funcall(rb_str_freeze(result), i_uminus, 0);
750
+ result = rb_str_to_interned_str(result);
759
751
  }
760
752
 
761
753
  return result;
762
754
  }
763
755
 
764
756
  #define MAX_FAST_INTEGER_SIZE 18
765
- static inline VALUE fast_decode_integer(const char *p, const char *pe)
766
- {
767
- bool negative = false;
768
- if (*p == '-') {
769
- negative = true;
770
- p++;
771
- }
772
-
773
- long long memo = 0;
774
- while (p < pe) {
775
- memo *= 10;
776
- memo += *p - '0';
777
- p++;
778
- }
779
-
780
- if (negative) {
781
- memo = -memo;
782
- }
783
- return LL2NUM(memo);
784
- }
785
757
 
786
758
  static VALUE json_decode_large_integer(const char *start, long len)
787
759
  {
@@ -795,17 +767,27 @@ static VALUE json_decode_large_integer(const char *start, long len)
795
767
  }
796
768
 
797
769
  static inline VALUE
798
- json_decode_integer(const char *start, const char *end)
770
+ json_decode_integer(uint64_t mantissa, int mantissa_digits, bool negative, const char *start, const char *end)
799
771
  {
800
- long len = end - start;
801
- if (RB_LIKELY(len < MAX_FAST_INTEGER_SIZE)) {
802
- return fast_decode_integer(start, end);
772
+ if (RB_LIKELY(mantissa_digits < MAX_FAST_INTEGER_SIZE)) {
773
+ if (negative) {
774
+ return INT64T2NUM(-((int64_t)mantissa));
803
775
  }
804
- return json_decode_large_integer(start, len);
776
+ return UINT64T2NUM(mantissa);
777
+ }
778
+
779
+ return json_decode_large_integer(start, end - start);
805
780
  }
806
781
 
807
782
  static VALUE json_decode_large_float(const char *start, long len)
808
783
  {
784
+ if (RB_LIKELY(len < 64)) {
785
+ char buffer[64];
786
+ MEMCPY(buffer, start, char, len);
787
+ buffer[len] = '\0';
788
+ return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
789
+ }
790
+
809
791
  VALUE buffer_v;
810
792
  char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1);
811
793
  MEMCPY(buffer, start, char, len);
@@ -815,21 +797,24 @@ static VALUE json_decode_large_float(const char *start, long len)
815
797
  return number;
816
798
  }
817
799
 
818
- static VALUE json_decode_float(JSON_ParserConfig *config, const char *start, const char *end)
800
+ /* Ruby JSON optimized float decoder using vendored Ryu algorithm
801
+ * Accepts pre-extracted mantissa and exponent from first-pass validation
802
+ */
803
+ static inline VALUE json_decode_float(JSON_ParserConfig *config, uint64_t mantissa, int mantissa_digits, int32_t exponent, bool negative,
804
+ const char *start, const char *end)
819
805
  {
820
- long len = end - start;
821
-
822
806
  if (RB_UNLIKELY(config->decimal_class)) {
823
- VALUE text = rb_str_new(start, len);
807
+ VALUE text = rb_str_new(start, end - start);
824
808
  return rb_funcallv(config->decimal_class, config->decimal_method_id, 1, &text);
825
- } else if (RB_LIKELY(len < 64)) {
826
- char buffer[64];
827
- MEMCPY(buffer, start, char, len);
828
- buffer[len] = '\0';
829
- return DBL2NUM(rb_cstr_to_dbl(buffer, 1));
830
- } else {
831
- return json_decode_large_float(start, len);
832
809
  }
810
+
811
+ // Fall back to rb_cstr_to_dbl for potential subnormals (rare edge case)
812
+ // Ryu has rounding issues with subnormals around 1e-310 (< 2.225e-308)
813
+ if (RB_UNLIKELY(mantissa_digits > 17 || mantissa_digits + exponent < -307)) {
814
+ return json_decode_large_float(start, end - start);
815
+ }
816
+
817
+ return DBL2NUM(ryu_s2d_from_parts(mantissa, mantissa_digits, exponent, negative));
833
818
  }
834
819
 
835
820
  static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig *config, long count)
@@ -882,11 +867,6 @@ static void raise_duplicate_key_error(JSON_ParserState *state, VALUE duplicate_k
882
867
  rb_inspect(duplicate_key)
883
868
  );
884
869
 
885
- long line, column;
886
- cursor_position(state, &line, &column);
887
- rb_str_concat(message, build_parse_error_message("", state, line, column)) ;
888
- rb_exc_raise(parse_error_new(message, line, column));
889
-
890
870
  raise_parse_error(RSTRING_PTR(message), state);
891
871
  RB_GC_GUARD(message);
892
872
  }
@@ -956,17 +936,11 @@ static const bool string_scan_table[256] = {
956
936
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
957
937
  };
958
938
 
959
- #if (defined(__GNUC__ ) || defined(__clang__))
960
- #define FORCE_INLINE __attribute__((always_inline))
961
- #else
962
- #define FORCE_INLINE
963
- #endif
964
-
965
939
  #ifdef HAVE_SIMD
966
940
  static SIMD_Implementation simd_impl = SIMD_NONE;
967
941
  #endif /* HAVE_SIMD */
968
942
 
969
- static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
943
+ static ALWAYS_INLINE() bool string_scan(JSON_ParserState *state)
970
944
  {
971
945
  #ifdef HAVE_SIMD
972
946
  #if defined(HAVE_SIMD_NEON)
@@ -988,7 +962,7 @@ static inline bool FORCE_INLINE string_scan(JSON_ParserState *state)
988
962
  #endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */
989
963
  #endif /* HAVE_SIMD */
990
964
 
991
- while (state->cursor < state->end) {
965
+ while (!eos(state)) {
992
966
  if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) {
993
967
  return 1;
994
968
  }
@@ -1030,16 +1004,160 @@ static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig
1030
1004
  return Qfalse;
1031
1005
  }
1032
1006
 
1007
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1008
+ // From: https://lemire.me/blog/2022/01/21/swar-explained-parsing-eight-digits/
1009
+ // Additional References:
1010
+ // https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
1011
+ // http://0x80.pl/notesen/2014-10-12-parsing-decimal-numbers-part-1-swar.html
1012
+ static inline uint64_t decode_8digits_unrolled(uint64_t val) {
1013
+ const uint64_t mask = 0x000000FF000000FF;
1014
+ const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32)
1015
+ const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32)
1016
+ val -= 0x3030303030303030;
1017
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1018
+ val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32;
1019
+ return val;
1020
+ }
1021
+
1022
+ static inline uint64_t decode_4digits_unrolled(uint32_t val) {
1023
+ const uint32_t mask = 0x000000FF;
1024
+ const uint32_t mul1 = 100;
1025
+ val -= 0x30303030;
1026
+ val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8;
1027
+ val = ((val & mask) * mul1) + (((val >> 16) & mask));
1028
+ return val;
1029
+ }
1030
+ #endif
1031
+
1032
+ static inline int json_parse_digits(JSON_ParserState *state, uint64_t *accumulator)
1033
+ {
1034
+ const char *start = state->cursor;
1035
+
1036
+ #if JSON_CPU_LITTLE_ENDIAN_64BITS
1037
+ while (rest(state) >= sizeof(uint64_t)) {
1038
+ uint64_t next_8bytes;
1039
+ memcpy(&next_8bytes, state->cursor, sizeof(uint64_t));
1040
+
1041
+ // From: https://github.com/simdjson/simdjson/blob/32b301893c13d058095a07d9868edaaa42ee07aa/include/simdjson/generic/numberparsing.h#L333
1042
+ // Branchless version of: http://0x80.pl/articles/swar-digits-validate.html
1043
+ uint64_t match = (next_8bytes & 0xF0F0F0F0F0F0F0F0) | (((next_8bytes + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4);
1044
+
1045
+ if (match == 0x3333333333333333) { // 8 consecutive digits
1046
+ *accumulator = (*accumulator * 100000000) + decode_8digits_unrolled(next_8bytes);
1047
+ state->cursor += 8;
1048
+ continue;
1049
+ }
1050
+
1051
+ uint32_t consecutive_digits = trailing_zeros64(match ^ 0x3333333333333333) / CHAR_BIT;
1052
+
1053
+ if (consecutive_digits >= 4) {
1054
+ *accumulator = (*accumulator * 10000) + decode_4digits_unrolled((uint32_t)next_8bytes);
1055
+ state->cursor += 4;
1056
+ consecutive_digits -= 4;
1057
+ }
1058
+
1059
+ while (consecutive_digits) {
1060
+ *accumulator = *accumulator * 10 + (*state->cursor - '0');
1061
+ consecutive_digits--;
1062
+ state->cursor++;
1063
+ }
1064
+
1065
+ return (int)(state->cursor - start);
1066
+ }
1067
+ #endif
1068
+
1069
+ char next_char;
1070
+ while (rb_isdigit(next_char = peek(state))) {
1071
+ *accumulator = *accumulator * 10 + (next_char - '0');
1072
+ state->cursor++;
1073
+ }
1074
+ return (int)(state->cursor - start);
1075
+ }
1076
+
1077
+ static inline VALUE json_parse_number(JSON_ParserState *state, JSON_ParserConfig *config, bool negative, const char *start)
1078
+ {
1079
+ bool integer = true;
1080
+ const char first_digit = *state->cursor;
1081
+
1082
+ // Variables for Ryu optimization - extract digits during parsing
1083
+ int32_t exponent = 0;
1084
+ int decimal_point_pos = -1;
1085
+ uint64_t mantissa = 0;
1086
+
1087
+ // Parse integer part and extract mantissa digits
1088
+ int mantissa_digits = json_parse_digits(state, &mantissa);
1089
+
1090
+ if (RB_UNLIKELY((first_digit == '0' && mantissa_digits > 1) || (negative && mantissa_digits == 0))) {
1091
+ raise_parse_error_at("invalid number: %s", state, start);
1092
+ }
1093
+
1094
+ // Parse fractional part
1095
+ if (peek(state) == '.') {
1096
+ integer = false;
1097
+ decimal_point_pos = mantissa_digits; // Remember position of decimal point
1098
+ state->cursor++;
1099
+
1100
+ int fractional_digits = json_parse_digits(state, &mantissa);
1101
+ mantissa_digits += fractional_digits;
1102
+
1103
+ if (RB_UNLIKELY(!fractional_digits)) {
1104
+ raise_parse_error_at("invalid number: %s", state, start);
1105
+ }
1106
+ }
1107
+
1108
+ // Parse exponent
1109
+ if (rb_tolower(peek(state)) == 'e') {
1110
+ integer = false;
1111
+ state->cursor++;
1112
+
1113
+ bool negative_exponent = false;
1114
+ const char next_char = peek(state);
1115
+ if (next_char == '-' || next_char == '+') {
1116
+ negative_exponent = next_char == '-';
1117
+ state->cursor++;
1118
+ }
1119
+
1120
+ uint64_t abs_exponent = 0;
1121
+ int exponent_digits = json_parse_digits(state, &abs_exponent);
1122
+
1123
+ if (RB_UNLIKELY(!exponent_digits)) {
1124
+ raise_parse_error_at("invalid number: %s", state, start);
1125
+ }
1126
+
1127
+ exponent = negative_exponent ? -((int32_t)abs_exponent) : ((int32_t)abs_exponent);
1128
+ }
1129
+
1130
+ if (integer) {
1131
+ return json_decode_integer(mantissa, mantissa_digits, negative, start, state->cursor);
1132
+ }
1133
+
1134
+ // Adjust exponent based on decimal point position
1135
+ if (decimal_point_pos >= 0) {
1136
+ exponent -= (mantissa_digits - decimal_point_pos);
1137
+ }
1138
+
1139
+ return json_decode_float(config, mantissa, mantissa_digits, exponent, negative, start, state->cursor);
1140
+ }
1141
+
1142
+ static inline VALUE json_parse_positive_number(JSON_ParserState *state, JSON_ParserConfig *config)
1143
+ {
1144
+ return json_parse_number(state, config, false, state->cursor);
1145
+ }
1146
+
1147
+ static inline VALUE json_parse_negative_number(JSON_ParserState *state, JSON_ParserConfig *config)
1148
+ {
1149
+ const char *start = state->cursor;
1150
+ state->cursor++;
1151
+ return json_parse_number(state, config, true, start);
1152
+ }
1153
+
1033
1154
  static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1034
1155
  {
1035
1156
  json_eat_whitespace(state);
1036
- if (state->cursor >= state->end) {
1037
- raise_parse_error("unexpected end of input", state);
1038
- }
1039
1157
 
1040
- switch (*state->cursor) {
1158
+ switch (peek(state)) {
1041
1159
  case 'n':
1042
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "null", 4) == 0)) {
1160
+ if (rest(state) >= 4 && (memcmp(state->cursor, "null", 4) == 0)) {
1043
1161
  state->cursor += 4;
1044
1162
  return json_push_value(state, config, Qnil);
1045
1163
  }
@@ -1047,7 +1165,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1047
1165
  raise_parse_error("unexpected token %s", state);
1048
1166
  break;
1049
1167
  case 't':
1050
- if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
1168
+ if (rest(state) >= 4 && (memcmp(state->cursor, "true", 4) == 0)) {
1051
1169
  state->cursor += 4;
1052
1170
  return json_push_value(state, config, Qtrue);
1053
1171
  }
@@ -1056,7 +1174,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1056
1174
  break;
1057
1175
  case 'f':
1058
1176
  // Note: memcmp with a small power of two compile to an integer comparison
1059
- if ((state->end - state->cursor >= 5) && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1177
+ if (rest(state) >= 5 && (memcmp(state->cursor + 1, "alse", 4) == 0)) {
1060
1178
  state->cursor += 5;
1061
1179
  return json_push_value(state, config, Qfalse);
1062
1180
  }
@@ -1065,7 +1183,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1065
1183
  break;
1066
1184
  case 'N':
1067
1185
  // Note: memcmp with a small power of two compile to an integer comparison
1068
- if (config->allow_nan && (state->end - state->cursor >= 3) && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1186
+ if (config->allow_nan && rest(state) >= 3 && (memcmp(state->cursor + 1, "aN", 2) == 0)) {
1069
1187
  state->cursor += 3;
1070
1188
  return json_push_value(state, config, CNaN);
1071
1189
  }
@@ -1073,16 +1191,16 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1073
1191
  raise_parse_error("unexpected token %s", state);
1074
1192
  break;
1075
1193
  case 'I':
1076
- if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1194
+ if (config->allow_nan && rest(state) >= 8 && (memcmp(state->cursor, "Infinity", 8) == 0)) {
1077
1195
  state->cursor += 8;
1078
1196
  return json_push_value(state, config, CInfinity);
1079
1197
  }
1080
1198
 
1081
1199
  raise_parse_error("unexpected token %s", state);
1082
1200
  break;
1083
- case '-':
1201
+ case '-': {
1084
1202
  // Note: memcmp with a small power of two compile to an integer comparison
1085
- if ((state->end - state->cursor >= 9) && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1203
+ if (rest(state) >= 9 && (memcmp(state->cursor + 1, "Infinity", 8) == 0)) {
1086
1204
  if (config->allow_nan) {
1087
1205
  state->cursor += 9;
1088
1206
  return json_push_value(state, config, CMinusInfinity);
@@ -1090,62 +1208,12 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1090
1208
  raise_parse_error("unexpected token %s", state);
1091
1209
  }
1092
1210
  }
1093
- // Fallthrough
1094
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': {
1095
- bool integer = true;
1096
-
1097
- // /\A-?(0|[1-9]\d*)(\.\d+)?([Ee][-+]?\d+)?/
1098
- const char *start = state->cursor;
1099
- state->cursor++;
1100
-
1101
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1102
- state->cursor++;
1103
- }
1104
-
1105
- long integer_length = state->cursor - start;
1106
-
1107
- if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
1108
- raise_parse_error_at("invalid number: %s", state, start);
1109
- } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
1110
- raise_parse_error_at("invalid number: %s", state, start);
1111
- } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
1112
- raise_parse_error_at("invalid number: %s", state, start);
1113
- }
1114
-
1115
- if ((state->cursor < state->end) && (*state->cursor == '.')) {
1116
- integer = false;
1117
- state->cursor++;
1118
-
1119
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1120
- raise_parse_error("invalid number: %s", state);
1121
- }
1122
-
1123
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1124
- state->cursor++;
1125
- }
1126
- }
1127
-
1128
- if ((state->cursor < state->end) && ((*state->cursor == 'e') || (*state->cursor == 'E'))) {
1129
- integer = false;
1130
- state->cursor++;
1131
- if ((state->cursor < state->end) && ((*state->cursor == '+') || (*state->cursor == '-'))) {
1132
- state->cursor++;
1133
- }
1134
-
1135
- if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
1136
- raise_parse_error("invalid number: %s", state);
1137
- }
1138
-
1139
- while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
1140
- state->cursor++;
1141
- }
1142
- }
1143
-
1144
- if (integer) {
1145
- return json_push_value(state, config, json_decode_integer(start, state->cursor));
1146
- }
1147
- return json_push_value(state, config, json_decode_float(config, start, state->cursor));
1211
+ return json_push_value(state, config, json_parse_negative_number(state, config));
1212
+ break;
1148
1213
  }
1214
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
1215
+ return json_push_value(state, config, json_parse_positive_number(state, config));
1216
+ break;
1149
1217
  case '"': {
1150
1218
  // %r{\A"[^"\\\t\n\x00]*(?:\\[bfnrtu\\/"][^"\\]*)*"}
1151
1219
  return json_parse_string(state, config, false);
@@ -1156,7 +1224,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1156
1224
  json_eat_whitespace(state);
1157
1225
  long stack_head = state->stack->head;
1158
1226
 
1159
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1227
+ if (peek(state) == ']') {
1160
1228
  state->cursor++;
1161
1229
  return json_push_value(state, config, json_decode_array(state, config, 0));
1162
1230
  } else {
@@ -1171,26 +1239,26 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1171
1239
  while (true) {
1172
1240
  json_eat_whitespace(state);
1173
1241
 
1174
- if (state->cursor < state->end) {
1175
- if (*state->cursor == ']') {
1176
- state->cursor++;
1177
- long count = state->stack->head - stack_head;
1178
- state->current_nesting--;
1179
- state->in_array--;
1180
- return json_push_value(state, config, json_decode_array(state, config, count));
1181
- }
1242
+ const char next_char = peek(state);
1182
1243
 
1183
- if (*state->cursor == ',') {
1184
- state->cursor++;
1185
- if (config->allow_trailing_comma) {
1186
- json_eat_whitespace(state);
1187
- if ((state->cursor < state->end) && (*state->cursor == ']')) {
1188
- continue;
1189
- }
1244
+ if (RB_LIKELY(next_char == ',')) {
1245
+ state->cursor++;
1246
+ if (config->allow_trailing_comma) {
1247
+ json_eat_whitespace(state);
1248
+ if (peek(state) == ']') {
1249
+ continue;
1190
1250
  }
1191
- json_parse_any(state, config);
1192
- continue;
1193
1251
  }
1252
+ json_parse_any(state, config);
1253
+ continue;
1254
+ }
1255
+
1256
+ if (next_char == ']') {
1257
+ state->cursor++;
1258
+ long count = state->stack->head - stack_head;
1259
+ state->current_nesting--;
1260
+ state->in_array--;
1261
+ return json_push_value(state, config, json_decode_array(state, config, count));
1194
1262
  }
1195
1263
 
1196
1264
  raise_parse_error("expected ',' or ']' after array value", state);
@@ -1204,7 +1272,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1204
1272
  json_eat_whitespace(state);
1205
1273
  long stack_head = state->stack->head;
1206
1274
 
1207
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1275
+ if (peek(state) == '}') {
1208
1276
  state->cursor++;
1209
1277
  return json_push_value(state, config, json_decode_object(state, config, 0));
1210
1278
  } else {
@@ -1213,13 +1281,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1213
1281
  rb_raise(eNestingError, "nesting of %d is too deep", state->current_nesting);
1214
1282
  }
1215
1283
 
1216
- if (*state->cursor != '"') {
1284
+ if (peek(state) != '"') {
1217
1285
  raise_parse_error("expected object key, got %s", state);
1218
1286
  }
1219
1287
  json_parse_string(state, config, true);
1220
1288
 
1221
1289
  json_eat_whitespace(state);
1222
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1290
+ if (peek(state) != ':') {
1223
1291
  raise_parse_error("expected ':' after object key", state);
1224
1292
  }
1225
1293
  state->cursor++;
@@ -1230,46 +1298,45 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1230
1298
  while (true) {
1231
1299
  json_eat_whitespace(state);
1232
1300
 
1233
- if (state->cursor < state->end) {
1234
- if (*state->cursor == '}') {
1235
- state->cursor++;
1236
- state->current_nesting--;
1237
- size_t count = state->stack->head - stack_head;
1301
+ const char next_char = peek(state);
1302
+ if (next_char == '}') {
1303
+ state->cursor++;
1304
+ state->current_nesting--;
1305
+ size_t count = state->stack->head - stack_head;
1238
1306
 
1239
- // Temporary rewind cursor in case an error is raised
1240
- const char *final_cursor = state->cursor;
1241
- state->cursor = object_start_cursor;
1242
- VALUE object = json_decode_object(state, config, count);
1243
- state->cursor = final_cursor;
1307
+ // Temporary rewind cursor in case an error is raised
1308
+ const char *final_cursor = state->cursor;
1309
+ state->cursor = object_start_cursor;
1310
+ VALUE object = json_decode_object(state, config, count);
1311
+ state->cursor = final_cursor;
1244
1312
 
1245
- return json_push_value(state, config, object);
1246
- }
1313
+ return json_push_value(state, config, object);
1314
+ }
1247
1315
 
1248
- if (*state->cursor == ',') {
1249
- state->cursor++;
1250
- json_eat_whitespace(state);
1316
+ if (next_char == ',') {
1317
+ state->cursor++;
1318
+ json_eat_whitespace(state);
1251
1319
 
1252
- if (config->allow_trailing_comma) {
1253
- if ((state->cursor < state->end) && (*state->cursor == '}')) {
1254
- continue;
1255
- }
1320
+ if (config->allow_trailing_comma) {
1321
+ if (peek(state) == '}') {
1322
+ continue;
1256
1323
  }
1324
+ }
1257
1325
 
1258
- if (*state->cursor != '"') {
1259
- raise_parse_error("expected object key, got: %s", state);
1260
- }
1261
- json_parse_string(state, config, true);
1326
+ if (RB_UNLIKELY(peek(state) != '"')) {
1327
+ raise_parse_error("expected object key, got: %s", state);
1328
+ }
1329
+ json_parse_string(state, config, true);
1262
1330
 
1263
- json_eat_whitespace(state);
1264
- if ((state->cursor >= state->end) || (*state->cursor != ':')) {
1265
- raise_parse_error("expected ':' after object key, got: %s", state);
1266
- }
1267
- state->cursor++;
1331
+ json_eat_whitespace(state);
1332
+ if (RB_UNLIKELY(peek(state) != ':')) {
1333
+ raise_parse_error("expected ':' after object key, got: %s", state);
1334
+ }
1335
+ state->cursor++;
1268
1336
 
1269
- json_parse_any(state, config);
1337
+ json_parse_any(state, config);
1270
1338
 
1271
- continue;
1272
- }
1339
+ continue;
1273
1340
  }
1274
1341
 
1275
1342
  raise_parse_error("expected ',' or '}' after object value, got: %s", state);
@@ -1277,18 +1344,23 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
1277
1344
  break;
1278
1345
  }
1279
1346
 
1347
+ case 0:
1348
+ raise_parse_error("unexpected end of input", state);
1349
+ break;
1350
+
1280
1351
  default:
1281
1352
  raise_parse_error("unexpected character: %s", state);
1282
1353
  break;
1283
1354
  }
1284
1355
 
1285
1356
  raise_parse_error("unreachable: %s", state);
1357
+ return Qundef;
1286
1358
  }
1287
1359
 
1288
1360
  static void json_ensure_eof(JSON_ParserState *state)
1289
1361
  {
1290
1362
  json_eat_whitespace(state);
1291
- if (state->cursor != state->end) {
1363
+ if (!eos(state)) {
1292
1364
  raise_parse_error("unexpected token at end of stream %s", state);
1293
1365
  }
1294
1366
  }