oj 3.16.11 → 3.16.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 30aea721380a4e3edc306dd19906d8777f230a639ba3427e9394dd543a3a7e3b
4
- data.tar.gz: b024a9d4513c16c1bfe4fc3c4adeacb1afd9a0e670987476d32cfa2fa74e9b1e
3
+ metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
4
+ data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
5
5
  SHA512:
6
- metadata.gz: 527ea1162cb135bbe16eefc10a7cb05444182767aca6fa0b6986622e52d7082bcec020c43e663251406c81602018f7d0842c2c5cee37aeca0269560e502d99dd
7
- data.tar.gz: e49e9f63e373cb0ec21f604f97899f87815b86ef5a5eafad30e7bddbd11e71156f92beaa1259c83609c2d45d2a8aac87c8b27e3e266fcc2bd99a1908327c796d
6
+ metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
7
+ data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.16.13 - 2025-12-05
4
+
5
+ - Fixed rails encoding for Hash and Array subclasses.
6
+
7
+ ## 3.16.12 - 2025-10-29
8
+
9
+ - Fixed dump realloc bug that occurred when using the compat mode dump options.
10
+
3
11
  ## 3.16.11 - 2025-05-29
4
12
 
5
13
  - Fixed range encoding with the :circular option
data/ext/oj/dump.c CHANGED
@@ -252,38 +252,46 @@ inline static size_t hixss_friendly_size(const uint8_t *str, size_t len) {
252
252
  }
253
253
 
254
254
  inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
255
- long size = 0;
256
- uint8_t hi = 0;
255
+ long size = 0;
256
+ uint32_t hi = 0;
257
257
 
258
258
  #ifdef HAVE_SIMD_NEON
259
259
  size_t i = 0;
260
260
 
261
- uint8x16_t has_some_hibit = vdupq_n_u8(0);
262
- uint8x16_t hibit = vdupq_n_u8(0x80);
263
- for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
264
- size += sizeof(uint8x16_t);
261
+ if (len >= sizeof(uint8x16_t)) {
262
+ uint8x16_t has_some_hibit = vdupq_n_u8(0);
263
+ uint8x16_t hibit = vdupq_n_u8(0x80);
265
264
 
266
- uint8x16_t chunk = vld1q_u8(str);
265
+ for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
266
+ size += sizeof(uint8x16_t);
267
267
 
268
- // Check to see if any of these bytes have the high bit set.
269
- has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
268
+ uint8x16_t chunk = vld1q_u8(str);
270
269
 
271
- uint8x16_t tmp1 = vqtbl4q_u8(rails_xss_friendly_chars_neon[0], chunk);
272
- uint8x16_t tmp2 = vqtbl4q_u8(rails_xss_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
273
- uint8x16_t tmp3 = vqtbl4q_u8(rails_xss_friendly_chars_neon[2], veorq_u8(chunk, vdupq_n_u8(0x80)));
274
- uint8x16_t tmp4 = vqtbl4q_u8(rails_xss_friendly_chars_neon[3], veorq_u8(chunk, vdupq_n_u8(0xc0)));
275
- uint8x16_t result = vorrq_u8(tmp4, vorrq_u8(tmp3, vorrq_u8(tmp1, tmp2)));
276
- uint8_t tmp = vaddvq_u8(result);
277
- size += tmp;
270
+ // Check to see if any of these bytes have the high bit set.
271
+ has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
272
+
273
+ uint8x16_t tmp1 = vqtbl4q_u8(rails_xss_friendly_chars_neon[0], chunk);
274
+ uint8x16_t tmp2 = vqtbl4q_u8(rails_xss_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
275
+ uint8x16_t tmp3 = vqtbl4q_u8(rails_xss_friendly_chars_neon[2], veorq_u8(chunk, vdupq_n_u8(0x80)));
276
+ uint8x16_t tmp4 = vqtbl4q_u8(rails_xss_friendly_chars_neon[3], veorq_u8(chunk, vdupq_n_u8(0xc0)));
277
+ uint8x16_t result = vorrq_u8(tmp4, vorrq_u8(tmp3, vorrq_u8(tmp1, tmp2)));
278
+ uint8_t tmp = vaddvq_u8(result);
279
+ size += tmp;
280
+ }
281
+
282
+ // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
283
+ hi = vmaxvq_u8(has_some_hibit) != 0;
278
284
  }
279
285
 
280
- // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
281
- hi = vmaxvq_u8(has_some_hibit) != 0;
286
+ size_t len_remaining = len - i;
282
287
 
283
288
  for (; i < len; str++, i++) {
284
- size += rails_xss_friendly_chars[*str] - '0';
289
+ size += rails_xss_friendly_chars[*str];
285
290
  hi |= *str & 0x80;
286
291
  }
292
+
293
+ size -= (len_remaining * ((size_t)'0'));
294
+
287
295
  if (0 == hi) {
288
296
  return size;
289
297
  }
@@ -302,37 +310,43 @@ inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
302
310
  }
303
311
 
304
312
  inline static size_t rails_friendly_size(const uint8_t *str, size_t len) {
305
- long size = 0;
306
- uint8_t hi = 0;
313
+ long size = 0;
314
+ uint32_t hi = 0;
307
315
  #ifdef HAVE_SIMD_NEON
308
- size_t i = 0;
316
+ size_t i = 0;
317
+ long extra = 0;
309
318
 
310
- uint8x16_t has_some_hibit = vdupq_n_u8(0);
311
- uint8x16_t hibit = vdupq_n_u8(0x80);
319
+ if (len >= sizeof(uint8x16_t)) {
320
+ uint8x16_t has_some_hibit = vdupq_n_u8(0);
321
+ uint8x16_t hibit = vdupq_n_u8(0x80);
312
322
 
313
- for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
314
- size += sizeof(uint8x16_t);
323
+ for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
324
+ size += sizeof(uint8x16_t);
315
325
 
316
- // See https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
317
- uint8x16_t chunk = vld1q_u8(str);
326
+ // See https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
327
+ uint8x16_t chunk = vld1q_u8(str);
318
328
 
319
- // Check to see if any of these bytes have the high bit set.
320
- has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
329
+ // Check to see if any of these bytes have the high bit set.
330
+ has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
321
331
 
322
- uint8x16_t tmp1 = vqtbl4q_u8(rails_friendly_chars_neon[0], chunk);
323
- uint8x16_t tmp2 = vqtbl4q_u8(rails_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
324
- uint8x16_t result = vorrq_u8(tmp1, tmp2);
325
- uint8_t tmp = vaddvq_u8(result);
326
- size += tmp;
327
- }
332
+ uint8x16_t tmp1 = vqtbl4q_u8(rails_friendly_chars_neon[0], chunk);
333
+ uint8x16_t tmp2 = vqtbl4q_u8(rails_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
334
+ uint8x16_t result = vorrq_u8(tmp1, tmp2);
335
+ uint8_t tmp = vaddvq_u8(result);
336
+ size += tmp;
337
+ }
328
338
 
329
- // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
330
- hi = vmaxvq_u8(has_some_hibit) != 0;
339
+ // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
340
+ hi = vmaxvq_u8(has_some_hibit) != 0;
341
+ }
331
342
 
332
- for (; i < len; str++, i++) {
333
- size += rails_friendly_chars[*str] - '0';
343
+ for (; i < len; str++, i++, extra++) {
344
+ size += rails_friendly_chars[*str];
334
345
  hi |= *str & 0x80;
335
346
  }
347
+
348
+ size -= (extra * ((size_t)'0'));
349
+
336
350
  if (0 == hi) {
337
351
  return size;
338
352
  }
@@ -896,6 +910,12 @@ void oj_dump_raw_json(VALUE obj, int depth, Out out) {
896
910
  }
897
911
  }
898
912
 
913
+ #if defined(__clang__) || defined(__GNUC__)
914
+ #define FORCE_INLINE __attribute__((always_inline))
915
+ #else
916
+ #define FORCE_INLINE
917
+ #endif
918
+
899
919
  #ifdef HAVE_SIMD_NEON
900
920
  typedef struct _neon_match_result {
901
921
  uint8x16_t needs_escape;
@@ -903,12 +923,6 @@ typedef struct _neon_match_result {
903
923
  bool do_unicode_validation;
904
924
  } neon_match_result;
905
925
 
906
- #if defined(__clang__) || defined(__GNUC__)
907
- #define FORCE_INLINE __attribute__((always_inline))
908
- #else
909
- #define FORCE_INLINE
910
- #endif
911
-
912
926
  static inline FORCE_INLINE neon_match_result
913
927
  neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool do_unicode_validation, bool has_hi) {
914
928
  neon_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
@@ -932,12 +946,83 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
932
946
 
933
947
  #endif /* HAVE_SIMD_NEON */
934
948
 
949
+ static inline FORCE_INLINE const char *process_character(char action,
950
+ const char *str,
951
+ const char *end,
952
+ Out out,
953
+ const char *orig,
954
+ bool do_unicode_validation,
955
+ const char **check_start_) {
956
+ const char *check_start = *check_start_;
957
+ switch (action) {
958
+ case '1':
959
+ if (do_unicode_validation && check_start <= str) {
960
+ if (0 != (0x80 & (uint8_t)*str)) {
961
+ if (0xC0 == (0xC0 & (uint8_t)*str)) {
962
+ *check_start_ = check_unicode(str, end, orig);
963
+ } else {
964
+ raise_invalid_unicode(orig, (int)(end - orig), (int)(str - orig));
965
+ }
966
+ }
967
+ }
968
+ *out->cur++ = *str;
969
+ break;
970
+ case '2':
971
+ *out->cur++ = '\\';
972
+ switch (*str) {
973
+ case '\\': *out->cur++ = '\\'; break;
974
+ case '\b': *out->cur++ = 'b'; break;
975
+ case '\t': *out->cur++ = 't'; break;
976
+ case '\n': *out->cur++ = 'n'; break;
977
+ case '\f': *out->cur++ = 'f'; break;
978
+ case '\r': *out->cur++ = 'r'; break;
979
+ default: *out->cur++ = *str; break;
980
+ }
981
+ break;
982
+ case '3': // Unicode
983
+ if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
984
+ if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
985
+ str = dump_unicode(str, end, out, orig);
986
+ } else {
987
+ *check_start_ = check_unicode(str, end, orig);
988
+ *out->cur++ = *str;
989
+ }
990
+ break;
991
+ }
992
+ str = dump_unicode(str, end, out, orig);
993
+ break;
994
+ case '6': // control characters
995
+ if (*(uint8_t *)str < 0x80) {
996
+ if (0 == (uint8_t)*str && out->opts->dump_opts.omit_null_byte) {
997
+ break;
998
+ }
999
+ APPEND_CHARS(out->cur, "\\u00", 4);
1000
+ dump_hex((uint8_t)*str, out);
1001
+ } else {
1002
+ if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
1003
+ if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
1004
+ str = dump_unicode(str, end, out, orig);
1005
+ } else {
1006
+ *check_start_ = check_unicode(str, end, orig);
1007
+ *out->cur++ = *str;
1008
+ }
1009
+ break;
1010
+ }
1011
+ str = dump_unicode(str, end, out, orig);
1012
+ }
1013
+ break;
1014
+ default: break; // ignore, should never happen if the table is correct
1015
+ }
1016
+
1017
+ return str;
1018
+ }
1019
+
935
1020
  void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out out) {
936
1021
  size_t size;
937
1022
  char *cmap;
938
1023
  #ifdef HAVE_SIMD_NEON
939
- uint8x16x4_t *cmap_neon = NULL;
940
- int neon_table_size;
1024
+ uint8x16x4_t *cmap_neon = NULL;
1025
+ int neon_table_size = 0;
941
1026
  #endif /* HAVE_SIMD_NEON */
942
1027
  const char *orig = str;
943
1028
  bool has_hi = false;
@@ -1036,171 +1121,83 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1036
1121
  #ifdef HAVE_SIMD_NEON
1037
1122
  const char *chunk_start;
1038
1123
  const char *chunk_end;
1039
- const char *cursor = str;
1040
- int neon_state = (cmap_neon != NULL) ? 1 : 4;
1124
+ const char *cursor = str;
1125
+ bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
1041
1126
  char matches[16];
1042
- bool do_hi_validation = false;
1043
- // uint64_t neon_match_mask = 0;
1044
1127
  #define SEARCH_FLUSH \
1045
1128
  if (str > cursor) { \
1046
1129
  APPEND_CHARS(out->cur, cursor, str - cursor); \
1047
1130
  cursor = str; \
1048
1131
  }
1049
1132
 
1050
- loop:
1051
1133
  #endif /* HAVE_SIMD_NEON */
1052
- for (; str < end; str++) {
1053
- char action = 0;
1054
1134
  #ifdef HAVE_SIMD_NEON
1055
- /* neon_state:
1056
- * 1: Scanning for matches. There must be at least
1057
- sizeof(uint8x16_t) bytes of input data to use SIMD and
1058
- cmap_neon must be non-null.
1059
- * 2: Matches have been found. Will set str to the position of the
1060
- * next match and set the state to 3.
1061
- * If there are no more matches it will transition to state 1.
1062
- * 4: Fallback to the scalar algorithm. Not enough data to use
1063
- * SIMD.
1064
- */
1065
- #define NEON_SET_STATE(state) \
1066
- neon_state = state; \
1067
- goto loop;
1068
- #define NEON_RETURN_TO_STATE(state) neon_state = state;
1069
- switch (neon_state) {
1070
- case 1: {
1071
- while (true) {
1072
- const char *chunk_ptr = NULL;
1073
- if (str + sizeof(uint8x16_t) <= end) {
1074
- chunk_ptr = str;
1075
- chunk_start = str;
1076
- chunk_end = str + sizeof(uint8x16_t);
1077
- } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
1078
- memset(out->cur, 'A', sizeof(uint8x16_t));
1079
- memcpy(out->cur, str, (end - str));
1080
- chunk_ptr = out->cur;
1081
- chunk_start = str;
1082
- chunk_end = end;
1083
- } else {
1084
- SEARCH_FLUSH;
1085
- NEON_SET_STATE(4);
1086
- break; /* Unreachable */
1087
- }
1088
- neon_match_result result = neon_update(chunk_ptr,
1089
- cmap_neon,
1090
- neon_table_size,
1091
- do_unicode_validation,
1092
- has_hi);
1093
- if ((result.do_unicode_validation) || vmaxvq_u8(result.needs_escape) != 0) {
1094
- SEARCH_FLUSH;
1095
- uint8x16_t actions = vaddq_u8(result.needs_escape, vdupq_n_u8('1'));
1096
- do_hi_validation = result.do_unicode_validation;
1097
- vst1q_u8((unsigned char *)matches, actions);
1098
- NEON_SET_STATE(2);
1099
- break; /* Unreachable */
1100
- }
1101
- str = chunk_end;
1102
- }
1103
- // We must have run out of data to use SIMD. Go to state 4.
1104
- SEARCH_FLUSH;
1105
- NEON_SET_STATE(4);
1106
- } break;
1107
- case 3:
1108
- cursor = str;
1109
- // This fall through is intentional. We return to state 3 after we process
1110
- // a byte (or multiple). We return to this state to ensure the cursor is
1111
- // pointing to the correct location. We then resume looking for matches
1112
- // within the previously processed chunk.
1113
- case 2:
1114
- if (str >= chunk_end) {
1115
- NEON_SET_STATE(1);
1116
- }
1117
- if (!do_hi_validation) {
1118
- long i = str - chunk_start;
1119
- for (; str < chunk_end; i++) {
1120
- if ((action = matches[i]) != '1') {
1121
- break;
1122
- }
1123
- *out->cur++ = *str++;
1124
- }
1125
- // The loop above may have advanced str and directly output them to out->cur.
1126
- // Ensure cursor is set appropriately.
1127
- cursor = str;
1128
- if (str >= chunk_end) {
1129
- // We must have advanced past the end... we are done.
1130
- NEON_SET_STATE(1);
1131
- }
1135
+ if (use_neon) {
1136
+ while (str < end) {
1137
+ const char *chunk_ptr = NULL;
1138
+ if (str + sizeof(uint8x16_t) <= end) {
1139
+ chunk_ptr = str;
1140
+ chunk_start = str;
1141
+ chunk_end = str + sizeof(uint8x16_t);
1142
+ } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
1143
+ memset(out->cur, 'A', sizeof(uint8x16_t));
1144
+ memcpy(out->cur, str, (end - str));
1145
+ chunk_ptr = out->cur;
1146
+ chunk_start = str;
1147
+ chunk_end = end;
1132
1148
  } else {
1133
- long match_index = str - chunk_start;
1134
- action = matches[match_index];
1149
+ break;
1135
1150
  }
1136
- NEON_RETURN_TO_STATE(3);
1137
- break;
1138
- case 4: action = cmap[(uint8_t)*str];
1139
- }
1140
- #undef NEON_SET_STATE
1141
- #undef NEON_RETURN_TO_STATE
1142
- #else
1143
- action = cmap[(uint8_t)*str];
1144
- #endif /* HAVE_SIMD_NEON */
1145
- switch (action) {
1146
- case '1':
1147
- if (do_unicode_validation && check_start <= str) {
1148
- if (0 != (0x80 & (uint8_t)*str)) {
1149
- if (0xC0 == (0xC0 & (uint8_t)*str)) {
1150
- check_start = check_unicode(str, end, orig);
1151
- } else {
1152
- raise_invalid_unicode(orig, (int)(end - orig), (int)(str - orig));
1151
+ neon_match_result result = neon_update(chunk_ptr,
1152
+ cmap_neon,
1153
+ neon_table_size,
1154
+ do_unicode_validation,
1155
+ has_hi);
1156
+ if ((result.do_unicode_validation) || vmaxvq_u8(result.needs_escape) != 0) {
1157
+ SEARCH_FLUSH;
1158
+ uint8x16_t actions = vaddq_u8(result.needs_escape, vdupq_n_u8('1'));
1159
+ uint8_t num_matches = vaddvq_u8(vandq_u8(result.needs_escape, vdupq_n_u8(0x1)));
1160
+ vst1q_u8((unsigned char *)matches, actions);
1161
+ bool process_each = result.do_unicode_validation || (num_matches > sizeof(uint8x16_t) / 2);
1162
+ // If no byte in this chunk had the high bit set then we can skip
1163
+ // all of the '1' bytes by directly copying them to the output.
1164
+ if (!process_each) {
1165
+ while (str < chunk_end) {
1166
+ long i = str - chunk_start;
1167
+ char action;
1168
+ while (str < chunk_end && (action = matches[i++]) == '1') {
1169
+ *out->cur++ = *str++;
1170
+ }
1171
+ cursor = str;
1172
+ if (str >= chunk_end) {
1173
+ break;
1174
+ }
1175
+ str = process_character(action, str, end, out, orig, do_unicode_validation, &check_start);
1176
+ str++;
1153
1177
  }
1154
- }
1155
- }
1156
- *out->cur++ = *str;
1157
- break;
1158
- case '2':
1159
- *out->cur++ = '\\';
1160
- switch (*str) {
1161
- case '\\': *out->cur++ = '\\'; break;
1162
- case '\b': *out->cur++ = 'b'; break;
1163
- case '\t': *out->cur++ = 't'; break;
1164
- case '\n': *out->cur++ = 'n'; break;
1165
- case '\f': *out->cur++ = 'f'; break;
1166
- case '\r': *out->cur++ = 'r'; break;
1167
- default: *out->cur++ = *str; break;
1168
- }
1169
- break;
1170
- case '3': // Unicode
1171
- if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
1172
- if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
1173
- str = dump_unicode(str, end, out, orig);
1174
1178
  } else {
1175
- check_start = check_unicode(str, end, orig);
1176
- *out->cur++ = *str;
1177
- }
1178
- break;
1179
- }
1180
- str = dump_unicode(str, end, out, orig);
1181
- break;
1182
- case '6': // control characters
1183
- if (*(uint8_t *)str < 0x80) {
1184
- if (0 == (uint8_t)*str && out->opts->dump_opts.omit_null_byte) {
1185
- break;
1186
- }
1187
- APPEND_CHARS(out->cur, "\\u00", 4);
1188
- dump_hex((uint8_t)*str, out);
1189
- } else {
1190
- if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
1191
- if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
1192
- str = dump_unicode(str, end, out, orig);
1193
- } else {
1194
- check_start = check_unicode(str, end, orig);
1195
- *out->cur++ = *str;
1179
+ while (str < chunk_end) {
1180
+ long match_index = str - chunk_start;
1181
+ str = process_character(matches[match_index],
1182
+ str,
1183
+ end,
1184
+ out,
1185
+ orig,
1186
+ do_unicode_validation,
1187
+ &check_start);
1188
+ str++;
1196
1189
  }
1197
- break;
1198
1190
  }
1199
- str = dump_unicode(str, end, out, orig);
1191
+ cursor = str;
1192
+ continue;
1200
1193
  }
1201
- break;
1202
- default: break; // ignore, should never happen if the table is correct
1194
+ str = chunk_end;
1203
1195
  }
1196
+ SEARCH_FLUSH;
1197
+ }
1198
+ #endif /* HAVE_SIMD_NEON */
1199
+ for (; str < end; str++) {
1200
+ str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
1204
1201
  }
1205
1202
  *out->cur++ = '"';
1206
1203
  }
data/ext/oj/dump_compat.c CHANGED
@@ -148,10 +148,10 @@ static void dump_array(VALUE a, int depth, Out out, bool as_ok) {
148
148
  } else {
149
149
  size = d2 * out->indent + 2;
150
150
  }
151
- assure_size(out, size * cnt);
152
151
  cnt--;
153
152
  for (i = 0; i <= cnt; i++) {
154
153
  if (out->opts->dump_opts.use) {
154
+ assure_size(out, size);
155
155
  if (0 < out->opts->dump_opts.array_size) {
156
156
  APPEND_CHARS(out->cur, out->opts->dump_opts.array_nl, out->opts->dump_opts.array_size);
157
157
  }
data/ext/oj/extconf.rb CHANGED
@@ -35,13 +35,12 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
35
35
 
36
36
  dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
37
37
 
38
- if with_config('--with-sse42')
39
- if try_cflags('-msse4.2')
40
- $CPPFLAGS += ' -msse4.2'
41
- dflags['OJ_USE_SSE4_2'] = 1
42
- else
43
- warn 'SSE 4.2 is not supported on this platform.'
44
- end
38
+ # Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
39
+ # Falls back to SSE2 or compiler defaults if not available
40
+ if try_cflags('-msse4.2')
41
+ $CPPFLAGS += ' -msse4.2'
42
+ elsif try_cflags('-msse2')
43
+ $CPPFLAGS += ' -msse2'
45
44
  end
46
45
 
47
46
  if enable_config('trace-log', false)
data/ext/oj/parse.c CHANGED
@@ -15,12 +15,9 @@
15
15
  #include "mem.h"
16
16
  #include "oj.h"
17
17
  #include "rxclass.h"
18
+ #include "simd.h"
18
19
  #include "val_stack.h"
19
20
 
20
- #ifdef OJ_USE_SSE4_2
21
- #include <nmmintrin.h>
22
- #endif
23
-
24
21
  // Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
25
22
  #define OJ_INFINITY (1.0 / 0.0)
26
23
 
@@ -202,23 +199,143 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
202
199
  return str;
203
200
  }
204
201
 
205
- #ifdef OJ_USE_SSE4_2
206
- static inline const char *scan_string_SIMD(const char *str, const char *end) {
207
- static const char chars[16] = "\x00\\\"";
208
- const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
209
- const char *_end = (const char *)(end - 16);
202
+ #ifdef HAVE_SIMD_SSE4_2
203
+ // Optimized SIMD string scanner using SSE4.2 instructions
204
+ // Uses prefetching and processes multiple chunks in parallel to reduce latency
205
+ static inline const char *scan_string_SSE42(const char *str, const char *end) {
206
+ static const char chars[16] = "\x00\\\"";
207
+ const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
208
+ const char *safe_end_64 = end - 64;
209
+ const char *safe_end_16 = end - 16;
210
+
211
+ // Process 64 bytes at a time with parallel SIMD operations
212
+ // This reduces pipeline stalls and improves instruction-level parallelism
213
+ while (str <= safe_end_64) {
214
+ // Prefetch next cache line for better memory throughput
215
+ __builtin_prefetch(str + 64, 0, 0);
216
+
217
+ // Load and compare 4 chunks in parallel
218
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
219
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
220
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
221
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
222
+
223
+ const int r0 = _mm_cmpestri(terminate,
224
+ 3,
225
+ chunk0,
226
+ 16,
227
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
228
+ if (__builtin_expect(r0 != 16, 0))
229
+ return str + r0;
230
+
231
+ const int r1 = _mm_cmpestri(terminate,
232
+ 3,
233
+ chunk1,
234
+ 16,
235
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
236
+ if (__builtin_expect(r1 != 16, 0))
237
+ return str + 16 + r1;
238
+
239
+ const int r2 = _mm_cmpestri(terminate,
240
+ 3,
241
+ chunk2,
242
+ 16,
243
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
244
+ if (__builtin_expect(r2 != 16, 0))
245
+ return str + 32 + r2;
246
+
247
+ const int r3 = _mm_cmpestri(terminate,
248
+ 3,
249
+ chunk3,
250
+ 16,
251
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
252
+ if (__builtin_expect(r3 != 16, 0))
253
+ return str + 48 + r3;
254
+
255
+ str += 64;
256
+ }
210
257
 
211
- for (; str <= _end; str += 16) {
258
+ // Handle remaining 16-byte chunks
259
+ for (; str <= safe_end_16; str += 16) {
212
260
  const __m128i string = _mm_loadu_si128((const __m128i *)str);
213
261
  const int r = _mm_cmpestri(terminate,
214
262
  3,
215
263
  string,
216
264
  16,
217
265
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
218
- if (r != 16) {
219
- str = (char *)(str + r);
220
- return str;
221
- }
266
+ if (r != 16)
267
+ return str + r;
268
+ }
269
+
270
+ return scan_string_noSIMD(str, end);
271
+ }
272
+ #endif
273
+
274
+ #ifdef HAVE_SIMD_SSE2
275
+ // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
276
+ // Uses SSE2 instructions with prefetching and parallel processing
277
+ static inline const char *scan_string_SSE2(const char *str, const char *end) {
278
+ const char *safe_end_64 = end - 64;
279
+ const char *safe_end_16 = end - 16;
280
+
281
+ // Create comparison vectors for our three special characters
282
+ const __m128i null_char = _mm_setzero_si128();
283
+ const __m128i backslash = _mm_set1_epi8('\\');
284
+ const __m128i quote = _mm_set1_epi8('"');
285
+
286
+ // Process 64 bytes at a time for better throughput
287
+ while (str <= safe_end_64) {
288
+ __builtin_prefetch(str + 64, 0, 0);
289
+
290
+ // Load 4 chunks
291
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
292
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
293
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
294
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
295
+
296
+ // Compare all chunks (allows CPU to parallelize)
297
+ const __m128i cmp0 = _mm_or_si128(
298
+ _mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char), _mm_cmpeq_epi8(chunk0, backslash)),
299
+ _mm_cmpeq_epi8(chunk0, quote));
300
+ const __m128i cmp1 = _mm_or_si128(
301
+ _mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char), _mm_cmpeq_epi8(chunk1, backslash)),
302
+ _mm_cmpeq_epi8(chunk1, quote));
303
+ const __m128i cmp2 = _mm_or_si128(
304
+ _mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char), _mm_cmpeq_epi8(chunk2, backslash)),
305
+ _mm_cmpeq_epi8(chunk2, quote));
306
+ const __m128i cmp3 = _mm_or_si128(
307
+ _mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char), _mm_cmpeq_epi8(chunk3, backslash)),
308
+ _mm_cmpeq_epi8(chunk3, quote));
309
+
310
+ // Convert to masks
311
+ int mask0 = _mm_movemask_epi8(cmp0);
312
+ if (__builtin_expect(mask0 != 0, 0))
313
+ return str + __builtin_ctz(mask0);
314
+
315
+ int mask1 = _mm_movemask_epi8(cmp1);
316
+ if (__builtin_expect(mask1 != 0, 0))
317
+ return str + 16 + __builtin_ctz(mask1);
318
+
319
+ int mask2 = _mm_movemask_epi8(cmp2);
320
+ if (__builtin_expect(mask2 != 0, 0))
321
+ return str + 32 + __builtin_ctz(mask2);
322
+
323
+ int mask3 = _mm_movemask_epi8(cmp3);
324
+ if (__builtin_expect(mask3 != 0, 0))
325
+ return str + 48 + __builtin_ctz(mask3);
326
+
327
+ str += 64;
328
+ }
329
+
330
+ // Handle remaining 16-byte chunks
331
+ for (; str <= safe_end_16; str += 16) {
332
+ const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
333
+ const __m128i matches = _mm_or_si128(
334
+ _mm_or_si128(_mm_cmpeq_epi8(chunk, null_char), _mm_cmpeq_epi8(chunk, backslash)),
335
+ _mm_cmpeq_epi8(chunk, quote));
336
+ int mask = _mm_movemask_epi8(matches);
337
+ if (mask != 0)
338
+ return str + __builtin_ctz(mask);
222
339
  }
223
340
 
224
341
  return scan_string_noSIMD(str, end);
@@ -228,9 +345,12 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
228
345
  static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
229
346
 
230
347
  void oj_scanner_init(void) {
231
- #ifdef OJ_USE_SSE4_2
232
- scan_func = scan_string_SIMD;
348
+ #ifdef HAVE_SIMD_SSE4_2
349
+ scan_func = scan_string_SSE42;
350
+ #elif defined(HAVE_SIMD_SSE2)
351
+ scan_func = scan_string_SSE2;
233
352
  #endif
353
+ // Note: ARM NEON string scanning would be added here if needed
234
354
  }
235
355
 
236
356
  // entered at /
data/ext/oj/rails.c CHANGED
@@ -661,13 +661,15 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
661
661
  Encoder e = OJ_R_ALLOC(struct _encoder);
662
662
 
663
663
  e->opts = oj_default_options;
664
- e->arg = Qnil;
665
664
  copy_opts(&ropts, &e->ropts);
666
665
 
667
666
  if (1 <= argc && Qnil != *argv) {
668
- oj_parse_options(*argv, &e->opts);
669
667
  e->arg = *argv;
668
+ } else {
669
+ e->arg = rb_hash_new();
670
670
  }
671
+ oj_parse_options(*argv, &e->opts);
672
+
671
673
  return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
672
674
  }
673
675
 
data/ext/oj/simd.h CHANGED
@@ -1,10 +1,47 @@
1
1
  #ifndef OJ_SIMD_H
2
2
  #define OJ_SIMD_H
3
3
 
4
+ // SIMD architecture detection and configuration
5
+ // This header provides unified SIMD support across different CPU architectures
6
+
7
+ // x86/x86_64 SIMD detection
8
+ #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
9
+ #define HAVE_SIMD_X86 1
10
+
11
+ // SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
12
+ // Enabled automatically when compiler has -msse4.2 flag
13
+ #if defined(__SSE4_2__)
14
+ #define HAVE_SIMD_SSE4_2 1
15
+ #include <nmmintrin.h>
16
+ #endif
17
+
18
+ // SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
19
+ #if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
20
+ #define HAVE_SIMD_SSE2 1
21
+ #include <emmintrin.h>
22
+ #endif
23
+
24
+ #endif // x86/x86_64
25
+
26
+ // ARM NEON detection
4
27
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
5
28
  #define HAVE_SIMD_NEON 1
6
29
  #define SIMD_MINIMUM_THRESHOLD 6
7
30
  #include <arm_neon.h>
8
31
  #endif
9
32
 
10
- #endif /* OJ_SIMD_H */
33
+ // Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
34
+ #if defined(HAVE_SIMD_SSE4_2)
35
+ #define HAVE_SIMD_STRING_SCAN 1
36
+ #define SIMD_TYPE "SSE4.2"
37
+ #elif defined(HAVE_SIMD_NEON)
38
+ #define HAVE_SIMD_STRING_SCAN 1
39
+ #define SIMD_TYPE "NEON"
40
+ #elif defined(HAVE_SIMD_SSE2)
41
+ #define HAVE_SIMD_STRING_SCAN 1
42
+ #define SIMD_TYPE "SSE2"
43
+ #else
44
+ #define SIMD_TYPE "none"
45
+ #endif
46
+
47
+ #endif /* OJ_SIMD_H */
data/lib/oj/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Oj
2
2
  # Current version of the module.
3
- VERSION = '3.16.11'
3
+ VERSION = '3.16.13'
4
4
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oj
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.16.11
4
+ version: 3.16.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2025-05-30 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
12
  - !ruby/object:Gem::Dependency
13
13
  name: bigdecimal
@@ -91,9 +91,9 @@ executables: []
91
91
  extensions:
92
92
  - ext/oj/extconf.rb
93
93
  extra_rdoc_files:
94
- - README.md
95
- - LICENSE
96
94
  - CHANGELOG.md
95
+ - LICENSE
96
+ - README.md
97
97
  - RELEASE_NOTES.md
98
98
  - pages/Advanced.md
99
99
  - pages/Compatibility.md
@@ -229,7 +229,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
229
229
  - !ruby/object:Gem::Version
230
230
  version: '0'
231
231
  requirements: []
232
- rubygems_version: 3.6.2
232
+ rubygems_version: 3.6.9
233
233
  specification_version: 4
234
234
  summary: A fast JSON parser and serializer.
235
235
  test_files: []