oj 3.16.13 → 3.16.14

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
- data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
+ metadata.gz: c97d24d950284c4f108b8ff933e4358250d53b69f9300b3faf2b8b7f29fdb5b7
+ data.tar.gz: 3112c763244b2f558e4f2a8f0ae79c01dcea9ec4281a915a2e39a404a34bf8f1
  SHA512:
- metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
- data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b
+ metadata.gz: 3f32303a7e78478137b76fbadf4bcab93d164d60918fd5b8c20bccdd7cf0476e290e1290e4bc114f3fd6a3afaac0fb3c9df61c91ede88bfb9a9762d098731688
+ data.tar.gz: bcc7411ae7cde8ff0457563f67d99093b09c3d70a36a13c5db1144dbf983d5f864f204f421a966be30d3a2c3610f82033ed08c6c0c86d7ff2e8b391b4cd5b02f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
  # CHANGELOG

+ ## 3.16.14 - 2026-02-04
+
+ - Fixed SSE issue #989.
+
+ - Removed ostruct dependency.
+
+ - Removed generic object JSON gem tests.
+
  ## 3.16.13 - 2025-12-05

  - Fixed rails encoding for Hash and Array subclasses.
data/ext/oj/dump.c CHANGED
@@ -201,6 +201,45 @@ void initialize_neon(void) {
  }
  #endif

+ #ifdef HAVE_SIMD_SSE4_2
+
+ static __m128i hibit_friendly_chars_sse42[8];
+
+ // From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
+ inline uint32_t _mm_sum_epu8(const __m128i v) {
+ __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
+ return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
+ }
+
+ inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
+ size_t size = 0;
+ size_t i = 0;
+
+ for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
+ size += sizeof(__m128i);
+
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
+ __m128i tmp = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);
+ size += _mm_sum_epu8(tmp);
+ }
+ size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
+ return total;
+ }
+
+ void OJ_TARGET_SSE42 initialize_sse42(void) {
+ for (int i = 0; i < 8; i++) {
+ hibit_friendly_chars_sse42[i] = _mm_sub_epi8(
+ _mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),
+ _mm_set1_epi8('1'));
+ }
+ }
+
+ #else
+
+ #define SIMD_TARGET
+
+ #endif /* HAVE_SIMD_SSE4_2 */
+
  inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
  #ifdef HAVE_SIMD_NEON
  size_t size = 0;
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {

  size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
  return total;
+ #elif defined(HAVE_SIMD_SSE4_2)
+ if (SIMD_Impl == SIMD_SSE42) {
+ if (len >= sizeof(__m128i)) {
+ return hibit_friendly_size_sse42(str, len);
+ }
+ }
+ return calculate_string_size(str, len, hibit_friendly_chars);
  #else
  return calculate_string_size(str, len, hibit_friendly_chars);
  #endif
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
  return result;
  }

+ #elif defined(HAVE_SIMD_SSE4_2)
+ typedef struct _sse42_match_result {
+ __m128i actions;
+ bool needs_escape;
+ int escape_mask;
+ bool has_some_hibit;
+ bool do_unicode_validation;
+ } sse42_match_result;
+
+ static inline OJ_TARGET_SSE42 sse42_match_result
+ sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
+ sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
+
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
+ __m128i actions = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
+ __m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));
+ result.actions = _mm_add_epi8(actions, _mm_set1_epi8('1'));
+
+ result.escape_mask = _mm_movemask_epi8(needs_escape);
+ result.needs_escape = result.escape_mask != 0;
+ if (has_hi && do_unicode_validation) {
+ __m128i has_some_hibit = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
+ result.has_some_hibit = _mm_movemask_epi8(has_some_hibit) != 0;
+ result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;
+ }
+ return result;
+ }
+
  #endif /* HAVE_SIMD_NEON */

  static inline FORCE_INLINE const char *process_character(char action,
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
  #ifdef HAVE_SIMD_NEON
  uint8x16x4_t *cmap_neon = NULL;
  int neon_table_size = 0;
+ #elif defined(HAVE_SIMD_SSE4_2)
+ __m128i *cmap_sse42 = NULL;
+ int sse42_tab_size;
  #endif /* HAVE_SIMD_NEON */
  const char *orig = str;
  bool has_hi = false;
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
  #ifdef HAVE_SIMD_NEON
  cmap_neon = hibit_friendly_chars_neon;
  neon_table_size = 2;
+ #elif defined(HAVE_SIMD_SSE4_2)
+ cmap_sse42 = hibit_friendly_chars_sse42;
+ sse42_tab_size = 8;
  #endif /* HAVE_NEON_SIMD */
  size = hibit_friendly_size((uint8_t *)str, cnt);
  }
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
  if (is_sym) {
  *out->cur++ = ':';
  }
- #ifdef HAVE_SIMD_NEON
- const char *chunk_start;
- const char *chunk_end;
- const char *cursor = str;
- bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
- char matches[16];
+
+ #if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
+
  #define SEARCH_FLUSH \
  if (str > cursor) { \
  APPEND_CHARS(out->cur, cursor, str - cursor); \
  cursor = str; \
  }

- #endif /* HAVE_SIMD_NEON */
+ const char *chunk_start;
+ const char *chunk_end;
+ const char *cursor = str;
+ char matches[16];
+ #endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
+
+ #if defined(HAVE_SIMD_NEON)
+ bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
+ #elif defined(HAVE_SIMD_SSE4_2)
+ bool use_simd = false;
+ if (SIMD_Impl == SIMD_SSE42) {
+ use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
+ }
+ #endif
+
  #ifdef HAVE_SIMD_NEON
- if (use_neon) {
+ if (use_simd) {
  while (str < end) {
  const char *chunk_ptr = NULL;
  if (str + sizeof(uint8x16_t) <= end) {
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
  }
  SEARCH_FLUSH;
  }
- #endif /* HAVE_SIMD_NEON */
+ #endif
+
+ #ifdef HAVE_SIMD_SSE4_2
+ if (SIMD_Impl == SIMD_SSE42) {
+ if (use_simd) {
+ while (str < end) {
+ const char *chunk_ptr = NULL;
+ if (str + sizeof(__m128i) <= end) {
+ chunk_ptr = str;
+ chunk_start = str;
+ chunk_end = str + sizeof(__m128i);
+ } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
+ memset(out->cur, 'A', sizeof(__m128i));
+ memcpy(out->cur, str, (end - str));
+ chunk_ptr = out->cur;
+ chunk_start = str;
+ chunk_end = end;
+ } else {
+ break;
+ }
+ sse42_match_result result = sse42_update(chunk_ptr,
+ cmap_sse42,
+ sse42_tab_size,
+ do_unicode_validation,
+ has_hi);
+ if ((result.do_unicode_validation) || result.needs_escape) {
+ SEARCH_FLUSH;
+ _mm_storeu_si128((__m128i *)matches, result.actions);
+ while (str < chunk_end) {
+ long match_index = str - chunk_start;
+ str = process_character(matches[match_index],
+ str,
+ end,
+ out,
+ orig,
+ do_unicode_validation,
+ &check_start);
+ str++;
+ }
+ cursor = str;
+ continue;
+ }
+ str = chunk_end;
+ }
+ SEARCH_FLUSH;
+ }
+ }
+ #endif /* HAVE_SIMD_SSE4_2 */
+
  for (; str < end; str++) {
  str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
  }
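
The new sizing path above accumulates per-byte escape overhead with a horizontal byte sum: _mm_sad_epu8 against zero yields two 64-bit lane totals that are then added together. A minimal standalone sketch of that trick, assuming an x86_64 toolchain where SSE2 is baseline (illustrative only, not gem code):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sum the 16 unsigned bytes of a vector, as the SSE4.2 sizing path does. */
    static uint32_t sum_bytes_sse2(__m128i v) {
        /* _mm_sad_epu8 vs. zero: each 64-bit lane holds the sum of its 8 bytes. */
        __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
        /* Low lane via cvtsi128_si32, high lane via the 16-bit word at index 4. */
        return (uint32_t)_mm_cvtsi128_si32(vsum) + (uint32_t)_mm_extract_epi16(vsum, 4);
    }

    int main(void) {
        uint8_t bytes[16] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
        __m128i v = _mm_loadu_si128((const __m128i *)bytes);
        printf("sum = %u\n", sum_bytes_sse2(v)); /* 1 + 2 + ... + 16 = 136 */
        return 0;
    }
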
data/ext/oj/extconf.rb CHANGED
@@ -35,11 +35,15 @@ have_func('rb_ext_ractor_safe', 'ruby.h')

  dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?

- # Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
- # Falls back to SSE2 or compiler defaults if not available
- if try_cflags('-msse4.2')
- $CPPFLAGS += ' -msse4.2'
- elsif try_cflags('-msse2')
+ # SIMD optimizations use runtime CPU detection and function-level target attributes
+ # We do NOT add global -msse4.2/-msse2 flags here because:
+ # 1. It would cause illegal instruction errors on CPUs without SSE4.2
+ # 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
+ # 3. Runtime detection in oj_get_simd_implementation() selects the right path
+ #
+ # We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
+ # and needed for compiling the SSE2 fallback code on 32-bit x86
+ if try_cflags('-msse2')
  $CPPFLAGS += ' -msse2'
  end

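
The build comment above leans on per-function target attributes plus a runtime CPU check instead of global SIMD flags. A hedged standalone sketch of that pattern, assuming GCC or Clang on x86/x86_64 (the CRC example and names are illustrative, not gem code):

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Compiled with SSE4.2 enabled for this function only; no global -msse4.2. */
    __attribute__((target("sse4.2")))
    static uint32_t crc32c_hw(const unsigned char *p, size_t len) {
        uint32_t crc = 0;
        for (size_t i = 0; i < len; i++) crc = _mm_crc32_u8(crc, p[i]);
        return crc;
    }

    /* Portable fallback mirroring the per-byte CRC-32C update (no init/final inversion). */
    static uint32_t crc32c_sw(const unsigned char *p, size_t len) {
        uint32_t crc = 0;
        for (size_t i = 0; i < len; i++) {
            crc ^= p[i];
            for (int k = 0; k < 8; k++) crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
        }
        return crc;
    }

    int main(void) {
        const unsigned char msg[] = "oj";
        size_t len = strlen((const char *)msg);
        /* Runtime detection keeps the SSE4.2 code off CPUs that lack it. */
        if (__builtin_cpu_supports("sse4.2")) {
            printf("hw crc32c = %08x\n", (unsigned)crc32c_hw(msg, len));
        } else {
            printf("sw crc32c = %08x\n", (unsigned)crc32c_sw(msg, len));
        }
        return 0;
    }
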
data/ext/oj/oj.c CHANGED
@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
  VALUE oj_cache_mutex = Qnil;
  #endif

+ SIMD_Implementation SIMD_Impl = SIMD_NONE;
+
  extern void oj_parser_init();

  const char oj_json_class[] = "json_class";
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
  *
  * - *:wab* specifically for WAB data exchange.
  */
+
+ // =============================================================================
+ // Runtime SIMD CPU detection
+ // Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
+ // =============================================================================
+ SIMD_Implementation oj_get_simd_implementation(void) {
+ #ifdef HAVE_SIMD_X86
+ // x86/x86_64 runtime detection
+
+ #if defined(_MSC_VER)
+ // MSVC: Use __cpuid intrinsic
+ int cpu_info[4];
+ __cpuid(cpu_info, 1);
+
+ // Check for SSE4.2 (bit 20 of ECX)
+ if (cpu_info[2] & (1 << 20)) {
+ return SIMD_SSE42;
+ }
+ // Check for SSE2 (bit 26 of EDX)
+ if (cpu_info[3] & (1 << 26)) {
+ return SIMD_SSE2;
+ }
+
+ #elif defined(__GNUC__) || defined(__clang__)
+ // GCC/Clang: Use __builtin_cpu_supports if available
+ #if defined(__has_builtin)
+ #if __has_builtin(__builtin_cpu_supports)
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+ #endif
+ #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+ // GCC 4.8+ has __builtin_cpu_supports
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+ #endif
+
+ #ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
+ #ifdef HAVE_SIMD_SSE4_2
+ if (__builtin_cpu_supports("sse4.2")) {
+ return SIMD_SSE42;
+ }
+ #endif
+ #ifdef HAVE_SIMD_SSE2
+ if (__builtin_cpu_supports("sse2")) {
+ return SIMD_SSE2;
+ }
+ #endif
+ #else
+ // Fallback: Use CPUID instruction directly
+ unsigned int eax, ebx, ecx, edx;
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+ // Check for SSE4.2 (bit 20 of ECX)
+ if (ecx & (1 << 20)) {
+ return SIMD_SSE42;
+ }
+ // Check for SSE2 (bit 26 of EDX)
+ if (edx & (1 << 26)) {
+ return SIMD_SSE2;
+ }
+ }
+ #endif // OJ_HAS_BUILTIN_CPU_SUPPORTS
+
+ #endif // _MSC_VER vs GCC/Clang
+
+ #endif // HAVE_SIMD_X86
+
+ #ifdef HAVE_SIMD_NEON
+ // ARM NEON is always available on ARM64 and detected at compile time
+ return SIMD_NEON;
+ #endif
+
+ return SIMD_NONE;
+ }
+
  void Init_oj(void) {
  int err = 0;

@@ -2080,10 +2154,18 @@ void Init_oj(void) {
  #endif
  oj_init_doc();

+ SIMD_Impl = oj_get_simd_implementation();
+
  oj_parser_init();
  oj_scanner_init();

  #ifdef HAVE_SIMD_NEON
  initialize_neon();
  #endif /* HAVE_SIMD_NEON */
+
+ #ifdef HAVE_SIMD_SSE4_2
+ if (SIMD_Impl == SIMD_SSE42) {
+ initialize_sse42();
+ }
+ #endif /* HAVE_SIMD_SSE4_2 */
  }
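
For reference, the CPUID fallback branch of the detection logic above can be exercised on its own. A minimal sketch assuming GCC or Clang on x86/x86_64 (illustrative only, not gem code):

    #include <cpuid.h>
    #include <stdio.h>

    int main(void) {
        unsigned int eax, ebx, ecx, edx;
        if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
            if (ecx & (1u << 20)) {        /* CPUID.1:ECX bit 20 -> SSE4.2 */
                puts("SSE4.2");
            } else if (edx & (1u << 26)) { /* CPUID.1:EDX bit 26 -> SSE2 */
                puts("SSE2");
            } else {
                puts("no SSE2/SSE4.2");
            }
        } else {
            puts("CPUID leaf 1 unavailable");
        }
        return 0;
    }
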
data/ext/oj/parse.c CHANGED
@@ -202,7 +202,8 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
  #ifdef HAVE_SIMD_SSE4_2
  // Optimized SIMD string scanner using SSE4.2 instructions
  // Uses prefetching and processes multiple chunks in parallel to reduce latency
- static inline const char *scan_string_SSE42(const char *str, const char *end) {
+ // Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
+ static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {
  static const char chars[16] = "\x00\\\"";
  const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
  const char *safe_end_64 = end - 64;
@@ -212,7 +213,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  // This reduces pipeline stalls and improves instruction-level parallelism
  while (str <= safe_end_64) {
  // Prefetch next cache line for better memory throughput
- __builtin_prefetch(str + 64, 0, 0);
+ OJ_PREFETCH(str + 64);

  // Load and compare 4 chunks in parallel
  const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -225,7 +226,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  chunk0,
  16,
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
- if (__builtin_expect(r0 != 16, 0))
+ if (OJ_UNLIKELY(r0 != 16))
  return str + r0;

  const int r1 = _mm_cmpestri(terminate,
@@ -233,7 +234,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  chunk1,
  16,
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
- if (__builtin_expect(r1 != 16, 0))
+ if (OJ_UNLIKELY(r1 != 16))
  return str + 16 + r1;

  const int r2 = _mm_cmpestri(terminate,
@@ -241,7 +242,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  chunk2,
  16,
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
- if (__builtin_expect(r2 != 16, 0))
+ if (OJ_UNLIKELY(r2 != 16))
  return str + 32 + r2;

  const int r3 = _mm_cmpestri(terminate,
@@ -249,7 +250,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  chunk3,
  16,
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
- if (__builtin_expect(r3 != 16, 0))
+ if (OJ_UNLIKELY(r3 != 16))
  return str + 48 + r3;

  str += 64;
@@ -274,7 +275,8 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
  #ifdef HAVE_SIMD_SSE2
  // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
  // Uses SSE2 instructions with prefetching and parallel processing
- static inline const char *scan_string_SSE2(const char *str, const char *end) {
+ // Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
+ static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
  const char *safe_end_64 = end - 64;
  const char *safe_end_16 = end - 16;

@@ -285,7 +287,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {

  // Process 64 bytes at a time for better throughput
  while (str <= safe_end_64) {
- __builtin_prefetch(str + 64, 0, 0);
+ OJ_PREFETCH(str + 64);

  // Load 4 chunks
  const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -309,20 +311,20 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {

  // Convert to masks
  int mask0 = _mm_movemask_epi8(cmp0);
- if (__builtin_expect(mask0 != 0, 0))
- return str + __builtin_ctz(mask0);
+ if (OJ_UNLIKELY(mask0 != 0))
+ return str + OJ_CTZ(mask0);

  int mask1 = _mm_movemask_epi8(cmp1);
- if (__builtin_expect(mask1 != 0, 0))
- return str + 16 + __builtin_ctz(mask1);
+ if (OJ_UNLIKELY(mask1 != 0))
+ return str + 16 + OJ_CTZ(mask1);

  int mask2 = _mm_movemask_epi8(cmp2);
- if (__builtin_expect(mask2 != 0, 0))
- return str + 32 + __builtin_ctz(mask2);
+ if (OJ_UNLIKELY(mask2 != 0))
+ return str + 32 + OJ_CTZ(mask2);

  int mask3 = _mm_movemask_epi8(cmp3);
- if (__builtin_expect(mask3 != 0, 0))
- return str + 48 + __builtin_ctz(mask3);
+ if (OJ_UNLIKELY(mask3 != 0))
+ return str + 48 + OJ_CTZ(mask3);

  str += 64;
  }
@@ -335,7 +337,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
  _mm_cmpeq_epi8(chunk, quote));
  int mask = _mm_movemask_epi8(matches);
  if (mask != 0)
- return str + __builtin_ctz(mask);
+ return str + OJ_CTZ(mask);
  }

  return scan_string_noSIMD(str, end);
@@ -345,11 +347,19 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
  static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;

  void oj_scanner_init(void) {
+ // Use runtime CPU detection to select the best SIMD implementation
+ // This ensures we don't crash on CPUs that don't support SSE4.2
+ SIMD_Implementation impl = oj_get_simd_implementation();
+
+ switch (impl) {
  #ifdef HAVE_SIMD_SSE4_2
- scan_func = scan_string_SSE42;
- #elif defined(HAVE_SIMD_SSE2)
- scan_func = scan_string_SSE2;
+ case SIMD_SSE42: scan_func = scan_string_SSE42; break;
+ #endif
+ #ifdef HAVE_SIMD_SSE2
+ case SIMD_SSE2: scan_func = scan_string_SSE2; break;
  #endif
+ default: scan_func = scan_string_noSIMD; break;
+ }
  // Note: ARM NEON string scanning would be added here if needed
  }

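
The SSE2 scanner above finds the first interesting byte in each 16-byte chunk by turning a byte compare into a bitmask and counting trailing zeros. A standalone sketch of that compare/movemask/ctz step, assuming x86_64 with GCC or Clang (illustrative only, not gem code):

    #include <emmintrin.h>
    #include <stdio.h>

    /* Index of the first '"', '\\' or '\0' in a 16-byte chunk, or -1 if none. */
    static int first_special_in_chunk(const char *p) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)p);
        const __m128i hits  = _mm_or_si128(
            _mm_or_si128(_mm_cmpeq_epi8(chunk, _mm_set1_epi8('"')),
                         _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'))),
            _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\0')));
        int mask = _mm_movemask_epi8(hits);     /* one bit per matching byte */
        return mask ? __builtin_ctz(mask) : -1; /* lowest set bit = first match */
    }

    int main(void) {
        const char buf[16] = "abcdefgh\"ijklmn"; /* quote at index 8 */
        printf("first special byte at %d\n", first_special_in_chunk(buf));
        return 0;
    }
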
data/ext/oj/rails.c CHANGED
@@ -668,7 +668,7 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
  } else {
  e->arg = rb_hash_new();
  }
- oj_parse_options(*argv, &e->opts);
+ oj_parse_options(e->arg, &e->opts);

  return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
  }
data/ext/oj/simd.h CHANGED
@@ -3,45 +3,170 @@

  // SIMD architecture detection and configuration
  // This header provides unified SIMD support across different CPU architectures
+ // with cross-platform runtime detection (Windows/Linux/Mac)

+ // SIMD implementation enum - used for runtime selection
+ typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
+
+ // Define in oj.c.
+ extern SIMD_Implementation SIMD_Impl;
+
+ // Runtime CPU detection function (implemented in oj.c)
+ SIMD_Implementation oj_get_simd_implementation(void);
+
+ // =============================================================================
+ // Compiler compatibility macros
+ // =============================================================================
+
+ // Branch prediction hints
+ #if defined(__GNUC__) || defined(__clang__)
+ #define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
+ #define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
+ #else
+ #define OJ_LIKELY(x) (x)
+ #define OJ_UNLIKELY(x) (x)
+ #endif
+
+ // Prefetch hints
+ #if defined(__GNUC__) || defined(__clang__)
+ #define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
+ #elif defined(_MSC_VER)
+ #include <intrin.h>
+ #define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
+ #else
+ #define OJ_PREFETCH(addr) ((void)0)
+ #endif
+
+ // Count trailing zeros (for SSE2 mask scanning)
+ #if defined(__GNUC__) || defined(__clang__)
+ #define OJ_CTZ(x) __builtin_ctz(x)
+ #elif defined(_MSC_VER)
+ #include <intrin.h>
+ static __inline int oj_ctz_msvc(unsigned int x) {
+ unsigned long index;
+ _BitScanForward(&index, x);
+ return (int)index;
+ }
+ #define OJ_CTZ(x) oj_ctz_msvc(x)
+ #else
+ // Fallback: naive implementation
+ static inline int oj_ctz_fallback(unsigned int x) {
+ int count = 0;
+ while ((x & 1) == 0 && count < 32) {
+ x >>= 1;
+ count++;
+ }
+ return count;
+ }
+ #define OJ_CTZ(x) oj_ctz_fallback(x)
+ #endif
+
+ // =============================================================================
  // x86/x86_64 SIMD detection
+ // =============================================================================
  #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
  #define HAVE_SIMD_X86 1

- // SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
- // Enabled automatically when compiler has -msse4.2 flag
- #if defined(__SSE4_2__)
+ // Include appropriate SIMD headers
+ #if defined(_MSC_VER)
+ // MSVC: use intrin.h for all intrinsics
+ #include <intrin.h>
  #define HAVE_SIMD_SSE4_2 1
- #include <nmmintrin.h>
+ #define HAVE_SIMD_SSE2 1
+ #elif defined(__GNUC__) || defined(__clang__)
+ // GCC/Clang: check for header availability and include them
+ // We include headers but use target attributes to enable instructions per-function
+ // Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
+ #if __has_include(<cpuid.h>)
+ #include <cpuid.h>
  #endif
-
- // SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
- #if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
+ #if defined(__SSE4_2__) || defined(__SSE2__)
+ // If any SSE is enabled globally, x86intrin.h should be available
+ #include <x86intrin.h>
+ #define HAVE_SIMD_SSE4_2 1
+ #define HAVE_SIMD_SSE2 1
+ #else
+ // Try to include headers anyway for target attribute functions
+ #if __has_include(<x86intrin.h>)
+ #include <x86intrin.h>
+ #define HAVE_SIMD_SSE4_2 1
+ #define HAVE_SIMD_SSE2 1
+ #elif __has_include(<nmmintrin.h>)
+ #include <nmmintrin.h>
+ #define HAVE_SIMD_SSE4_2 1
  #define HAVE_SIMD_SSE2 1
+ #elif __has_include(<emmintrin.h>)
  #include <emmintrin.h>
+ #define HAVE_SIMD_SSE2 1
+ #endif
+ #endif
+ #endif
+
+ // Target attribute macros for function-level SIMD enabling
+ #if defined(__clang__) || defined(__GNUC__)
+ #define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
+ #define OJ_TARGET_SSE2 __attribute__((target("sse2")))
+ #else
+ // MSVC doesn't need target attributes - intrinsics are always available
+ #define OJ_TARGET_SSE42
+ #define OJ_TARGET_SSE2
  #endif

  #endif // x86/x86_64

+ // =============================================================================
  // ARM NEON detection
+ // =============================================================================
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
  #define HAVE_SIMD_NEON 1
  #define SIMD_MINIMUM_THRESHOLD 6
  #include <arm_neon.h>
  #endif

- // Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
- #if defined(HAVE_SIMD_SSE4_2)
+ // =============================================================================
+ // SIMD type string for debugging/logging
+ // =============================================================================
+ #if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
  #define HAVE_SIMD_STRING_SCAN 1
- #define SIMD_TYPE "SSE4.2"
+ #define SIMD_TYPE "x86 (runtime detected)"
  #elif defined(HAVE_SIMD_NEON)
  #define HAVE_SIMD_STRING_SCAN 1
  #define SIMD_TYPE "NEON"
- #elif defined(HAVE_SIMD_SSE2)
- #define HAVE_SIMD_STRING_SCAN 1
- #define SIMD_TYPE "SSE2"
  #else
  #define SIMD_TYPE "none"
  #endif

+ #if defined(HAVE_SIMD_SSE4_2)
+
+ #define SIMD_MINIMUM_THRESHOLD 6
+
+ extern void initialize_sse42(void);
+
+ static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
+ // Extract high 4 bits to determine which 16-byte chunk (0-15)
+ __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));
+
+ // Extract low 4 bits for index within the chunk (0-15)
+ __m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));
+
+ // Perform lookups in all 16 tables
+ __m128i results[16];
+ for (int i = 0; i < tab_size; i++) {
+ results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
+ }
+
+ // Create masks for each chunk and blend results
+ __m128i final_result = _mm_setzero_si128();
+
+ for (int i = 0; i < tab_size; i++) {
+ __m128i mask = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
+ __m128i masked_result = _mm_and_si128(mask, results[i]);
+ final_result = _mm_or_si128(final_result, masked_result);
+ }
+
+ return final_result;
+ }
+
+ #endif
+
  #endif /* OJ_SIMD_H */
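
The OJ_CTZ fallback above only matters on compilers without __builtin_ctz or _BitScanForward. A quick standalone check of the naive loop against the GCC/Clang builtin (assumes GCC or Clang; illustrative only, not gem code):

    #include <assert.h>
    #include <stdio.h>

    /* Same shape as the header's oj_ctz_fallback: shift until the low bit is set. */
    static int ctz_fallback(unsigned int x) {
        int count = 0;
        while ((x & 1u) == 0 && count < 32) {
            x >>= 1;
            count++;
        }
        return count;
    }

    int main(void) {
        unsigned int samples[] = {1u, 2u, 0x80u, 0x8000u, 0xFFFF0000u, 0x80000000u};
        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            assert(ctz_fallback(samples[i]) == __builtin_ctz(samples[i]));
        }
        puts("ctz fallback matches __builtin_ctz for the sampled masks");
        return 0;
    }
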
data/lib/oj/version.rb CHANGED
@@ -1,4 +1,4 @@
  module Oj
  # Current version of the module.
- VERSION = '3.16.13'
+ VERSION = '3.16.14'
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: oj
  version: !ruby/object:Gem::Version
- version: 3.16.13
+ version: 3.16.14
  platform: ruby
  authors:
  - Peter Ohler
@@ -23,20 +23,6 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: '3.0'
- - !ruby/object:Gem::Dependency
- name: ostruct
- requirement: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0.2'
- type: :runtime
- prerelease: false
- version_requirements: !ruby/object:Gem::Requirement
- requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '0.2'
  - !ruby/object:Gem::Dependency
  name: minitest
  requirement: !ruby/object:Gem::Requirement
@@ -229,7 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubygems_version: 3.6.9
+ rubygems_version: 4.0.3
  specification_version: 4
  summary: A fast JSON parser and serializer.
  test_files: []