oj 3.16.13 → 3.16.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
4
- data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
3
+ metadata.gz: 07c7a6ebc6baf4f02b93760ed3d7440cdc9b2af85d0976e525f34978ab823c45
4
+ data.tar.gz: ed5a5a2e16f8df836a098de0da60657688049cfcf820d3142b4122302ae4e946
5
5
  SHA512:
6
- metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
7
- data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b
6
+ metadata.gz: d8f5c73fbc30a2985c2352e466f8ef9762f831d15b9c0a8b9394d66a5613910fd5aca4c4bdc49c40230128d791244ad67f6f9852c4778e89828a4358bb997c11
7
+ data.tar.gz: dbc5fe50d1ad89cb23918c950ac849052a95defb7e5509d7653236eb3569c811685e2c4c6546a8ab38c5a346aad0ccdd9770c855bc53f96c0c5312c7d61fab3e
data/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.16.16 - 2026-03-13
4
+
5
+ - Unclosed arrays and objects are reported correctly in the usual parser.
6
+
7
+ ## 3.16.15 - 2026-02-05
8
+
9
+ - Fixed by putting the ostruct dependency back until a better way is found to conditionally include it.
10
+
11
+ ## 3.16.14 - 2026-02-04
12
+
13
+ - Fixed SSE issue #989.
14
+
15
+ - Removed ostruct dependency.
16
+
17
+ - Removed generic object JSON gem tests.
18
+
3
19
  ## 3.16.13 - 2025-12-05
4
20
 
5
21
  - Fixed rails encoding for Hash and Array subclasses.
data/ext/oj/dump.c CHANGED
@@ -201,6 +201,45 @@ void initialize_neon(void) {
201
201
  }
202
202
  #endif
203
203
 
204
+ #ifdef HAVE_SIMD_SSE4_2
205
+
206
+ static __m128i hibit_friendly_chars_sse42[8];
207
+
208
+ // From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
209
+ inline static OJ_TARGET_SSE42 uint32_t _mm_sum_epu8(const __m128i v) {
210
+ __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
211
+ return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
212
+ }
213
+
214
+ inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
215
+ size_t size = 0;
216
+ size_t i = 0;
217
+
218
+ for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
219
+ size += sizeof(__m128i);
220
+
221
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
222
+ __m128i tmp = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);
223
+ size += _mm_sum_epu8(tmp);
224
+ }
225
+ size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
226
+ return total;
227
+ }
228
+
229
+ void OJ_TARGET_SSE42 initialize_sse42(void) {
230
+ for (int i = 0; i < 8; i++) {
231
+ hibit_friendly_chars_sse42[i] = _mm_sub_epi8(
232
+ _mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),
233
+ _mm_set1_epi8('1'));
234
+ }
235
+ }
236
+
237
+ #else
238
+
239
+ #define SIMD_TARGET
240
+
241
+ #endif /* HAVE_SIMD_SSE4_2 */
242
+
204
243
  inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
205
244
  #ifdef HAVE_SIMD_NEON
206
245
  size_t size = 0;
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
220
259
 
221
260
  size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
222
261
  return total;
262
+ #elif defined(HAVE_SIMD_SSE4_2)
263
+ if (SIMD_Impl == SIMD_SSE42) {
264
+ if (len >= sizeof(__m128i)) {
265
+ return hibit_friendly_size_sse42(str, len);
266
+ }
267
+ }
268
+ return calculate_string_size(str, len, hibit_friendly_chars);
223
269
  #else
224
270
  return calculate_string_size(str, len, hibit_friendly_chars);
225
271
  #endif
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
944
990
  return result;
945
991
  }
946
992
 
993
+ #elif defined(HAVE_SIMD_SSE4_2)
994
+ typedef struct _sse42_match_result {
995
+ __m128i actions;
996
+ bool needs_escape;
997
+ int escape_mask;
998
+ bool has_some_hibit;
999
+ bool do_unicode_validation;
1000
+ } sse42_match_result;
1001
+
1002
+ static inline OJ_TARGET_SSE42 sse42_match_result
1003
+ sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
1004
+ sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
1005
+
1006
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
1007
+ __m128i actions = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
1008
+ __m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));
1009
+ result.actions = _mm_add_epi8(actions, _mm_set1_epi8('1'));
1010
+
1011
+ result.escape_mask = _mm_movemask_epi8(needs_escape);
1012
+ result.needs_escape = result.escape_mask != 0;
1013
+ if (has_hi && do_unicode_validation) {
1014
+ __m128i has_some_hibit = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
1015
+ result.has_some_hibit = _mm_movemask_epi8(has_some_hibit) != 0;
1016
+ result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;
1017
+ }
1018
+ return result;
1019
+ }
1020
+
947
1021
  #endif /* HAVE_SIMD_NEON */
948
1022
 
949
1023
  static inline FORCE_INLINE const char *process_character(char action,
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1023
1097
  #ifdef HAVE_SIMD_NEON
1024
1098
  uint8x16x4_t *cmap_neon = NULL;
1025
1099
  int neon_table_size = 0;
1100
+ #elif defined(HAVE_SIMD_SSE4_2)
1101
+ __m128i *cmap_sse42 = NULL;
1102
+ int sse42_tab_size;
1026
1103
  #endif /* HAVE_SIMD_NEON */
1027
1104
  const char *orig = str;
1028
1105
  bool has_hi = false;
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1091
1168
  #ifdef HAVE_SIMD_NEON
1092
1169
  cmap_neon = hibit_friendly_chars_neon;
1093
1170
  neon_table_size = 2;
1171
+ #elif defined(HAVE_SIMD_SSE4_2)
1172
+ cmap_sse42 = hibit_friendly_chars_sse42;
1173
+ sse42_tab_size = 8;
1094
1174
  #endif /* HAVE_NEON_SIMD */
1095
1175
  size = hibit_friendly_size((uint8_t *)str, cnt);
1096
1176
  }
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1118
1198
  if (is_sym) {
1119
1199
  *out->cur++ = ':';
1120
1200
  }
1121
- #ifdef HAVE_SIMD_NEON
1122
- const char *chunk_start;
1123
- const char *chunk_end;
1124
- const char *cursor = str;
1125
- bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
1126
- char matches[16];
1201
+
1202
+ #if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
1203
+
1127
1204
  #define SEARCH_FLUSH \
1128
1205
  if (str > cursor) { \
1129
1206
  APPEND_CHARS(out->cur, cursor, str - cursor); \
1130
1207
  cursor = str; \
1131
1208
  }
1132
1209
 
1133
- #endif /* HAVE_SIMD_NEON */
1210
+ const char *chunk_start;
1211
+ const char *chunk_end;
1212
+ const char *cursor = str;
1213
+ char matches[16];
1214
+ #endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
1215
+
1216
+ #if defined(HAVE_SIMD_NEON)
1217
+ bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
1218
+ #elif defined(HAVE_SIMD_SSE4_2)
1219
+ bool use_simd = false;
1220
+ if (SIMD_Impl == SIMD_SSE42) {
1221
+ use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
1222
+ }
1223
+ #endif
1224
+
1134
1225
  #ifdef HAVE_SIMD_NEON
1135
- if (use_neon) {
1226
+ if (use_simd) {
1136
1227
  while (str < end) {
1137
1228
  const char *chunk_ptr = NULL;
1138
1229
  if (str + sizeof(uint8x16_t) <= end) {
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1195
1286
  }
1196
1287
  SEARCH_FLUSH;
1197
1288
  }
1198
- #endif /* HAVE_SIMD_NEON */
1289
+ #endif
1290
+
1291
+ #ifdef HAVE_SIMD_SSE4_2
1292
+ if (SIMD_Impl == SIMD_SSE42) {
1293
+ if (use_simd) {
1294
+ while (str < end) {
1295
+ const char *chunk_ptr = NULL;
1296
+ if (str + sizeof(__m128i) <= end) {
1297
+ chunk_ptr = str;
1298
+ chunk_start = str;
1299
+ chunk_end = str + sizeof(__m128i);
1300
+ } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
1301
+ memset(out->cur, 'A', sizeof(__m128i));
1302
+ memcpy(out->cur, str, (end - str));
1303
+ chunk_ptr = out->cur;
1304
+ chunk_start = str;
1305
+ chunk_end = end;
1306
+ } else {
1307
+ break;
1308
+ }
1309
+ sse42_match_result result = sse42_update(chunk_ptr,
1310
+ cmap_sse42,
1311
+ sse42_tab_size,
1312
+ do_unicode_validation,
1313
+ has_hi);
1314
+ if ((result.do_unicode_validation) || result.needs_escape) {
1315
+ SEARCH_FLUSH;
1316
+ _mm_storeu_si128((__m128i *)matches, result.actions);
1317
+ while (str < chunk_end) {
1318
+ long match_index = str - chunk_start;
1319
+ str = process_character(matches[match_index],
1320
+ str,
1321
+ end,
1322
+ out,
1323
+ orig,
1324
+ do_unicode_validation,
1325
+ &check_start);
1326
+ str++;
1327
+ }
1328
+ cursor = str;
1329
+ continue;
1330
+ }
1331
+ str = chunk_end;
1332
+ }
1333
+ SEARCH_FLUSH;
1334
+ }
1335
+ }
1336
+ #endif /* HAVE_SIMD_SSE4_2 */
1337
+
1199
1338
  for (; str < end; str++) {
1200
1339
  str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
1201
1340
  }
data/ext/oj/extconf.rb CHANGED
@@ -35,11 +35,15 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
35
35
 
36
36
  dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
37
37
 
38
- # Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
39
- # Falls back to SSE2 or compiler defaults if not available
40
- if try_cflags('-msse4.2')
41
- $CPPFLAGS += ' -msse4.2'
42
- elsif try_cflags('-msse2')
38
+ # SIMD optimizations use runtime CPU detection and function-level target attributes
39
+ # We do NOT add global -msse4.2/-msse2 flags here because:
40
+ # 1. It would cause illegal instruction errors on CPUs without SSE4.2
41
+ # 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
42
+ # 3. Runtime detection in oj_get_simd_implementation() selects the right path
43
+ #
44
+ # We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
45
+ # and needed for compiling the SSE2 fallback code on 32-bit x86
46
+ if try_cflags('-msse2')
43
47
  $CPPFLAGS += ' -msse2'
44
48
  end
45
49
 
data/ext/oj/oj.c CHANGED
@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
167
167
  VALUE oj_cache_mutex = Qnil;
168
168
  #endif
169
169
 
170
+ SIMD_Implementation SIMD_Impl = SIMD_NONE;
171
+
170
172
  extern void oj_parser_init();
171
173
 
172
174
  const char oj_json_class[] = "json_class";
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
1780
1782
  *
1781
1783
  * - *:wab* specifically for WAB data exchange.
1782
1784
  */
1785
+
1786
+ // =============================================================================
1787
+ // Runtime SIMD CPU detection
1788
+ // Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
1789
+ // =============================================================================
1790
+ SIMD_Implementation oj_get_simd_implementation(void) {
1791
+ #ifdef HAVE_SIMD_X86
1792
+ // x86/x86_64 runtime detection
1793
+
1794
+ #if defined(_MSC_VER)
1795
+ // MSVC: Use __cpuid intrinsic
1796
+ int cpu_info[4];
1797
+ __cpuid(cpu_info, 1);
1798
+
1799
+ // Check for SSE4.2 (bit 20 of ECX)
1800
+ if (cpu_info[2] & (1 << 20)) {
1801
+ return SIMD_SSE42;
1802
+ }
1803
+ // Check for SSE2 (bit 26 of EDX)
1804
+ if (cpu_info[3] & (1 << 26)) {
1805
+ return SIMD_SSE2;
1806
+ }
1807
+
1808
+ #elif defined(__GNUC__) || defined(__clang__)
1809
+ // GCC/Clang: Use __builtin_cpu_supports if available
1810
+ #if defined(__has_builtin)
1811
+ #if __has_builtin(__builtin_cpu_supports)
1812
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
1813
+ #endif
1814
+ #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
1815
+ // GCC 4.8+ has __builtin_cpu_supports
1816
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
1817
+ #endif
1818
+
1819
+ #ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
1820
+ #ifdef HAVE_SIMD_SSE4_2
1821
+ if (__builtin_cpu_supports("sse4.2")) {
1822
+ return SIMD_SSE42;
1823
+ }
1824
+ #endif
1825
+ #ifdef HAVE_SIMD_SSE2
1826
+ if (__builtin_cpu_supports("sse2")) {
1827
+ return SIMD_SSE2;
1828
+ }
1829
+ #endif
1830
+ #else
1831
+ // Fallback: Use CPUID instruction directly
1832
+ unsigned int eax, ebx, ecx, edx;
1833
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
1834
+ // Check for SSE4.2 (bit 20 of ECX)
1835
+ if (ecx & (1 << 20)) {
1836
+ return SIMD_SSE42;
1837
+ }
1838
+ // Check for SSE2 (bit 26 of EDX)
1839
+ if (edx & (1 << 26)) {
1840
+ return SIMD_SSE2;
1841
+ }
1842
+ }
1843
+ #endif // OJ_HAS_BUILTIN_CPU_SUPPORTS
1844
+
1845
+ #endif // _MSC_VER vs GCC/Clang
1846
+
1847
+ #endif // HAVE_SIMD_X86
1848
+
1849
+ #ifdef HAVE_SIMD_NEON
1850
+ // ARM NEON is always available on ARM64 and detected at compile time
1851
+ return SIMD_NEON;
1852
+ #endif
1853
+
1854
+ return SIMD_NONE;
1855
+ }
1856
+
1783
1857
  void Init_oj(void) {
1784
1858
  int err = 0;
1785
1859
 
@@ -2080,10 +2154,18 @@ void Init_oj(void) {
2080
2154
  #endif
2081
2155
  oj_init_doc();
2082
2156
 
2157
+ SIMD_Impl = oj_get_simd_implementation();
2158
+
2083
2159
  oj_parser_init();
2084
2160
  oj_scanner_init();
2085
2161
 
2086
2162
  #ifdef HAVE_SIMD_NEON
2087
2163
  initialize_neon();
2088
2164
  #endif /* HAVE_SIMD_NEON */
2165
+
2166
+ #ifdef HAVE_SIMD_SSE4_2
2167
+ if (SIMD_Impl == SIMD_SSE42) {
2168
+ initialize_sse42();
2169
+ }
2170
+ #endif /* HAVE_SIMD_SSE4_2 */
2089
2171
  }
data/ext/oj/parse.c CHANGED
@@ -199,10 +199,36 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
199
199
  return str;
200
200
  }
201
201
 
202
+ #ifdef HAVE_SIMD_NEON
203
+
204
+ static inline const char *string_scan_neon(const char *str, const char *end) {
205
+ const uint8x16_t null_char = vdupq_n_u8(0);
206
+ const uint8x16_t backslash = vdupq_n_u8('\\');
207
+ const uint8x16_t quote = vdupq_n_u8('"');
208
+
209
+ while (str + sizeof(uint8x16_t) <= end) {
210
+ uint8x16_t chunk = vld1q_u8((const uint8_t *)str);
211
+ uint8x16_t tmp = vorrq_u8(vorrq_u8(vceqq_u8(chunk, null_char), vceqq_u8(chunk, backslash)),
212
+ vceqq_u8(chunk, quote));
213
+ const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(tmp), 4);
214
+ uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
215
+ if (mask != 0) {
216
+ mask &= 0x8888888888888888ull;
217
+ return str + (OJ_CTZ64(mask) >> 2);
218
+ }
219
+ str += sizeof(uint8x16_t);
220
+ }
221
+
222
+ return scan_string_noSIMD(str, end);
223
+ }
224
+
225
+ #endif
226
+
202
227
  #ifdef HAVE_SIMD_SSE4_2
203
228
  // Optimized SIMD string scanner using SSE4.2 instructions
204
229
  // Uses prefetching and processes multiple chunks in parallel to reduce latency
205
- static inline const char *scan_string_SSE42(const char *str, const char *end) {
230
+ // Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
231
+ static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {
206
232
  static const char chars[16] = "\x00\\\"";
207
233
  const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
208
234
  const char *safe_end_64 = end - 64;
@@ -212,7 +238,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
212
238
  // This reduces pipeline stalls and improves instruction-level parallelism
213
239
  while (str <= safe_end_64) {
214
240
  // Prefetch next cache line for better memory throughput
215
- __builtin_prefetch(str + 64, 0, 0);
241
+ OJ_PREFETCH(str + 64);
216
242
 
217
243
  // Load and compare 4 chunks in parallel
218
244
  const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -225,7 +251,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
225
251
  chunk0,
226
252
  16,
227
253
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
228
- if (__builtin_expect(r0 != 16, 0))
254
+ if (OJ_UNLIKELY(r0 != 16))
229
255
  return str + r0;
230
256
 
231
257
  const int r1 = _mm_cmpestri(terminate,
@@ -233,7 +259,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
233
259
  chunk1,
234
260
  16,
235
261
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
236
- if (__builtin_expect(r1 != 16, 0))
262
+ if (OJ_UNLIKELY(r1 != 16))
237
263
  return str + 16 + r1;
238
264
 
239
265
  const int r2 = _mm_cmpestri(terminate,
@@ -241,7 +267,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
241
267
  chunk2,
242
268
  16,
243
269
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
244
- if (__builtin_expect(r2 != 16, 0))
270
+ if (OJ_UNLIKELY(r2 != 16))
245
271
  return str + 32 + r2;
246
272
 
247
273
  const int r3 = _mm_cmpestri(terminate,
@@ -249,7 +275,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
249
275
  chunk3,
250
276
  16,
251
277
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
252
- if (__builtin_expect(r3 != 16, 0))
278
+ if (OJ_UNLIKELY(r3 != 16))
253
279
  return str + 48 + r3;
254
280
 
255
281
  str += 64;
@@ -274,7 +300,8 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
274
300
  #ifdef HAVE_SIMD_SSE2
275
301
  // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
276
302
  // Uses SSE2 instructions with prefetching and parallel processing
277
- static inline const char *scan_string_SSE2(const char *str, const char *end) {
303
+ // Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
304
+ static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
278
305
  const char *safe_end_64 = end - 64;
279
306
  const char *safe_end_16 = end - 16;
280
307
 
@@ -285,7 +312,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
285
312
 
286
313
  // Process 64 bytes at a time for better throughput
287
314
  while (str <= safe_end_64) {
288
- __builtin_prefetch(str + 64, 0, 0);
315
+ OJ_PREFETCH(str + 64);
289
316
 
290
317
  // Load 4 chunks
291
318
  const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -309,20 +336,20 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
309
336
 
310
337
  // Convert to masks
311
338
  int mask0 = _mm_movemask_epi8(cmp0);
312
- if (__builtin_expect(mask0 != 0, 0))
313
- return str + __builtin_ctz(mask0);
339
+ if (OJ_UNLIKELY(mask0 != 0))
340
+ return str + OJ_CTZ(mask0);
314
341
 
315
342
  int mask1 = _mm_movemask_epi8(cmp1);
316
- if (__builtin_expect(mask1 != 0, 0))
317
- return str + 16 + __builtin_ctz(mask1);
343
+ if (OJ_UNLIKELY(mask1 != 0))
344
+ return str + 16 + OJ_CTZ(mask1);
318
345
 
319
346
  int mask2 = _mm_movemask_epi8(cmp2);
320
- if (__builtin_expect(mask2 != 0, 0))
321
- return str + 32 + __builtin_ctz(mask2);
347
+ if (OJ_UNLIKELY(mask2 != 0))
348
+ return str + 32 + OJ_CTZ(mask2);
322
349
 
323
350
  int mask3 = _mm_movemask_epi8(cmp3);
324
- if (__builtin_expect(mask3 != 0, 0))
325
- return str + 48 + __builtin_ctz(mask3);
351
+ if (OJ_UNLIKELY(mask3 != 0))
352
+ return str + 48 + OJ_CTZ(mask3);
326
353
 
327
354
  str += 64;
328
355
  }
@@ -335,7 +362,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
335
362
  _mm_cmpeq_epi8(chunk, quote));
336
363
  int mask = _mm_movemask_epi8(matches);
337
364
  if (mask != 0)
338
- return str + __builtin_ctz(mask);
365
+ return str + OJ_CTZ(mask);
339
366
  }
340
367
 
341
368
  return scan_string_noSIMD(str, end);
@@ -345,12 +372,22 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
345
372
  static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
346
373
 
347
374
  void oj_scanner_init(void) {
375
+ // Use runtime CPU detection to select the best SIMD implementation
376
+ // This ensures we don't crash on CPUs that don't support SSE4.2
377
+ SIMD_Implementation impl = oj_get_simd_implementation();
378
+
379
+ switch (impl) {
348
380
  #ifdef HAVE_SIMD_SSE4_2
349
- scan_func = scan_string_SSE42;
350
- #elif defined(HAVE_SIMD_SSE2)
351
- scan_func = scan_string_SSE2;
381
+ case SIMD_SSE42: scan_func = scan_string_SSE42; break;
382
+ #endif
383
+ #ifdef HAVE_SIMD_SSE2
384
+ case SIMD_SSE2: scan_func = scan_string_SSE2; break;
385
+ #endif
386
+ #ifdef HAVE_SIMD_NEON
387
+ case SIMD_NEON: scan_func = string_scan_neon; break;
352
388
  #endif
353
- // Note: ARM NEON string scanning would be added here if needed
389
+ default: scan_func = scan_string_noSIMD; break;
390
+ }
354
391
  }
355
392
 
356
393
  // entered at /
data/ext/oj/parser.c CHANGED
@@ -1371,6 +1371,13 @@ static VALUE parser_parse(VALUE self, VALUE json) {
1371
1371
  p->start(p);
1372
1372
  parse(p, ptr);
1373
1373
 
1374
+ if (0 < p->depth) {
1375
+ if (OBJECT_FUN == p->stack[p->depth]) {
1376
+ parse_error(p, "Object is not closed");
1377
+ } else {
1378
+ parse_error(p, "Array is not closed");
1379
+ }
1380
+ }
1374
1381
  return p->result(p);
1375
1382
  }
1376
1383
 
data/ext/oj/rails.c CHANGED
@@ -668,7 +668,7 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
668
668
  } else {
669
669
  e->arg = rb_hash_new();
670
670
  }
671
- oj_parse_options(*argv, &e->opts);
671
+ oj_parse_options(e->arg, &e->opts);
672
672
 
673
673
  return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
674
674
  }
data/ext/oj/simd.h CHANGED
@@ -3,45 +3,192 @@
3
3
 
4
4
  // SIMD architecture detection and configuration
5
5
  // This header provides unified SIMD support across different CPU architectures
6
+ // with cross-platform runtime detection (Windows/Linux/Mac)
6
7
 
8
+ // SIMD implementation enum - used for runtime selection
9
+ typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
10
+
11
+ // Define in oj.c.
12
+ extern SIMD_Implementation SIMD_Impl;
13
+
14
+ // Runtime CPU detection function (implemented in oj.c)
15
+ SIMD_Implementation oj_get_simd_implementation(void);
16
+
17
+ // =============================================================================
18
+ // Compiler compatibility macros
19
+ // =============================================================================
20
+
21
+ // Branch prediction hints
22
+ #if defined(__GNUC__) || defined(__clang__)
23
+ #define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
24
+ #define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
25
+ #else
26
+ #define OJ_LIKELY(x) (x)
27
+ #define OJ_UNLIKELY(x) (x)
28
+ #endif
29
+
30
+ // Prefetch hints
31
+ #if defined(__GNUC__) || defined(__clang__)
32
+ #define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
33
+ #elif defined(_MSC_VER)
34
+ #include <intrin.h>
35
+ #define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
36
+ #else
37
+ #define OJ_PREFETCH(addr) ((void)0)
38
+ #endif
39
+
40
+ // Count trailing zeros (for SSE2 mask scanning)
41
+ #if defined(__GNUC__) || defined(__clang__)
42
+ #define OJ_CTZ(x) __builtin_ctz(x)
43
+ #define OJ_CTZ64(x) __builtin_ctzll(x)
44
+ #elif defined(_MSC_VER)
45
+ #include <intrin.h>
46
+ static __inline int oj_ctz_msvc(unsigned int x) {
47
+ unsigned long index;
48
+ if (0 == x) {
49
+ return 32;
50
+ }
51
+ _BitScanForward(&index, x);
52
+ return (int)index;
53
+ }
54
+ static __inline int oj_ctz64_msvc(uint64_t x) {
55
+ unsigned long index;
56
+ if (_BitScanForward64(&index, x)) {
57
+ return (int)index;
58
+ }
59
+ return 64;
60
+ }
61
+ #define OJ_CTZ(x) oj_ctz_msvc(x)
62
+ #define OJ_CTZ64(x) oj_ctz64_msvc(x)
63
+ #else
64
+ // Fallback: naive implementation
65
+ static inline int oj_ctz_fallback(unsigned int x) {
66
+ int count = 0;
67
+ while ((x & 1) == 0 && count < 32) {
68
+ x >>= 1;
69
+ count++;
70
+ }
71
+ return count;
72
+ }
73
+
74
+ static inline int oj_ctz64_fallback(uint64_t x) {
75
+ int count = 0;
76
+ while ((x & 1) == 0 && count < 64) {
77
+ x >>= 1;
78
+ count++;
79
+ }
80
+ return count;
81
+ }
82
+ #define OJ_CTZ(x) oj_ctz_fallback(x)
83
+ #define OJ_CTZ64(x) oj_ctz64_fallback(x)
84
+ #endif
85
+
86
+ // =============================================================================
7
87
  // x86/x86_64 SIMD detection
88
+ // =============================================================================
8
89
  #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
9
90
  #define HAVE_SIMD_X86 1
10
91
 
11
- // SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
12
- // Enabled automatically when compiler has -msse4.2 flag
13
- #if defined(__SSE4_2__)
92
+ // Include appropriate SIMD headers
93
+ #if defined(_MSC_VER)
94
+ // MSVC: use intrin.h for all intrinsics
95
+ #include <intrin.h>
14
96
  #define HAVE_SIMD_SSE4_2 1
15
- #include <nmmintrin.h>
97
+ #define HAVE_SIMD_SSE2 1
98
+ #elif defined(__GNUC__) || defined(__clang__)
99
+ // GCC/Clang: check for header availability and include them
100
+ // We include headers but use target attributes to enable instructions per-function
101
+ // Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
102
+ #if __has_include(<cpuid.h>)
103
+ #include <cpuid.h>
16
104
  #endif
17
-
18
- // SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
19
- #if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
105
+ #if defined(__SSE4_2__) || defined(__SSE2__)
106
+ // If any SSE is enabled globally, x86intrin.h should be available
107
+ #include <x86intrin.h>
108
+ #define HAVE_SIMD_SSE4_2 1
20
109
  #define HAVE_SIMD_SSE2 1
110
+ #else
111
+ // Try to include headers anyway for target attribute functions
112
+ #if __has_include(<x86intrin.h>)
113
+ #include <x86intrin.h>
114
+ #define HAVE_SIMD_SSE4_2 1
115
+ #define HAVE_SIMD_SSE2 1
116
+ #elif __has_include(<nmmintrin.h>)
117
+ #include <nmmintrin.h>
118
+ #define HAVE_SIMD_SSE4_2 1
119
+ #define HAVE_SIMD_SSE2 1
120
+ #elif __has_include(<emmintrin.h>)
21
121
  #include <emmintrin.h>
122
+ #define HAVE_SIMD_SSE2 1
123
+ #endif
124
+ #endif
125
+ #endif
126
+
127
+ // Target attribute macros for function-level SIMD enabling
128
+ #if defined(__clang__) || defined(__GNUC__)
129
+ #define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
130
+ #define OJ_TARGET_SSE2 __attribute__((target("sse2")))
131
+ #else
132
+ // MSVC doesn't need target attributes - intrinsics are always available
133
+ #define OJ_TARGET_SSE42
134
+ #define OJ_TARGET_SSE2
22
135
  #endif
23
136
 
24
137
  #endif // x86/x86_64
25
138
 
139
+ // =============================================================================
26
140
  // ARM NEON detection
141
+ // =============================================================================
27
142
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
28
143
  #define HAVE_SIMD_NEON 1
29
144
  #define SIMD_MINIMUM_THRESHOLD 6
30
145
  #include <arm_neon.h>
31
146
  #endif
32
147
 
33
- // Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
34
- #if defined(HAVE_SIMD_SSE4_2)
148
+ // =============================================================================
149
+ // SIMD type string for debugging/logging
150
+ // =============================================================================
151
+ #if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
35
152
  #define HAVE_SIMD_STRING_SCAN 1
36
- #define SIMD_TYPE "SSE4.2"
153
+ #define SIMD_TYPE "x86 (runtime detected)"
37
154
  #elif defined(HAVE_SIMD_NEON)
38
155
  #define HAVE_SIMD_STRING_SCAN 1
39
156
  #define SIMD_TYPE "NEON"
40
- #elif defined(HAVE_SIMD_SSE2)
41
- #define HAVE_SIMD_STRING_SCAN 1
42
- #define SIMD_TYPE "SSE2"
43
157
  #else
44
158
  #define SIMD_TYPE "none"
45
159
  #endif
46
160
 
161
+ #if defined(HAVE_SIMD_SSE4_2)
162
+
163
+ #define SIMD_MINIMUM_THRESHOLD 6
164
+
165
+ extern void initialize_sse42(void);
166
+
167
+ static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
168
+ // Extract high 4 bits to determine which 16-byte chunk (0-15)
169
+ __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));
170
+
171
+ // Extract low 4 bits for index within the chunk (0-15)
172
+ __m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));
173
+
174
+ // Perform lookups in all 16 tables
175
+ __m128i results[16];
176
+ for (int i = 0; i < tab_size; i++) {
177
+ results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
178
+ }
179
+
180
+ // Create masks for each chunk and blend results
181
+ __m128i final_result = _mm_setzero_si128();
182
+
183
+ for (int i = 0; i < tab_size; i++) {
184
+ __m128i mask = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
185
+ __m128i masked_result = _mm_and_si128(mask, results[i]);
186
+ final_result = _mm_or_si128(final_result, masked_result);
187
+ }
188
+
189
+ return final_result;
190
+ }
191
+
192
+ #endif
193
+
47
194
  #endif /* OJ_SIMD_H */
data/lib/oj/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Oj
2
2
  # Current version of the module.
3
- VERSION = '3.16.13'
3
+ VERSION = '3.16.16'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oj
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.16.13
4
+ version: 3.16.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler
@@ -23,20 +23,6 @@ dependencies:
23
23
  - - ">="
24
24
  - !ruby/object:Gem::Version
25
25
  version: '3.0'
26
- - !ruby/object:Gem::Dependency
27
- name: ostruct
28
- requirement: !ruby/object:Gem::Requirement
29
- requirements:
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: '0.2'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - ">="
38
- - !ruby/object:Gem::Version
39
- version: '0.2'
40
26
  - !ruby/object:Gem::Dependency
41
27
  name: minitest
42
28
  requirement: !ruby/object:Gem::Requirement
@@ -85,6 +71,20 @@ dependencies:
85
71
  - - "~>"
86
72
  - !ruby/object:Gem::Version
87
73
  version: '3.0'
74
+ - !ruby/object:Gem::Dependency
75
+ name: ostruct
76
+ requirement: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0.2'
81
+ type: :runtime
82
+ prerelease: false
83
+ version_requirements: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0.2'
88
88
  description: The fastest JSON parser and object serializer.
89
89
  email: peter@ohler.com
90
90
  executables: []