oj 3.16.12 → 3.16.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e2053531cd4c7c7b49bf16ddafbfda868e67e66c1bfbec1b06daaa0ba3f1c45
4
- data.tar.gz: ce029b2f90660922dd8fb335c23463a1fe1d81317d56e48f2cfddc9271de2d15
3
+ metadata.gz: c97d24d950284c4f108b8ff933e4358250d53b69f9300b3faf2b8b7f29fdb5b7
4
+ data.tar.gz: 3112c763244b2f558e4f2a8f0ae79c01dcea9ec4281a915a2e39a404a34bf8f1
5
5
  SHA512:
6
- metadata.gz: deb7f1447b5022adad6d7387b8a8bfd866d399abc2d9e434f7e6d321fa73cb1738ff9aa7ee22ac064c455d5d3951ba7469a30720c9474af5e96c70eaa5b5303a
7
- data.tar.gz: b9d28d76c714947c1e6b133225e348838ad13f1c8b7dc82f0fee261272259cd0e83b911f2f2762c55a719f93e669275ad32d393f371c264a4e587e49b1c3a84b
6
+ metadata.gz: 3f32303a7e78478137b76fbadf4bcab93d164d60918fd5b8c20bccdd7cf0476e290e1290e4bc114f3fd6a3afaac0fb3c9df61c91ede88bfb9a9762d098731688
7
+ data.tar.gz: bcc7411ae7cde8ff0457563f67d99093b09c3d70a36a13c5db1144dbf983d5f864f204f421a966be30d3a2c3610f82033ed08c6c0c86d7ff2e8b391b4cd5b02f
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.16.14 - 2026-02-04
4
+
5
+ - Fixed SSE issue #989.
6
+
7
+ - Removed ostruct dependency.
8
+
9
+ - Removed generic object JSON gem tests.
10
+
11
+ ## 3.16.13 - 2025-12-05
12
+
13
+ - Fixed rails encoding for Hash and Array subclasses.
14
+
3
15
  ## 3.16.12 - 2025-10-29
4
16
 
5
17
  - Fixed dump realloc bug that occurred when using the compat mode dump options.
data/ext/oj/dump.c CHANGED
@@ -201,6 +201,45 @@ void initialize_neon(void) {
201
201
  }
202
202
  #endif
203
203
 
204
+ #ifdef HAVE_SIMD_SSE4_2
205
+
206
+ static __m128i hibit_friendly_chars_sse42[8];
207
+
208
+ // From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
209
+ inline uint32_t _mm_sum_epu8(const __m128i v) {
210
+ __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
211
+ return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
212
+ }
213
+
214
+ inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
215
+ size_t size = 0;
216
+ size_t i = 0;
217
+
218
+ for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
219
+ size += sizeof(__m128i);
220
+
221
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
222
+ __m128i tmp = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);
223
+ size += _mm_sum_epu8(tmp);
224
+ }
225
+ size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
226
+ return total;
227
+ }
228
+
229
+ void OJ_TARGET_SSE42 initialize_sse42(void) {
230
+ for (int i = 0; i < 8; i++) {
231
+ hibit_friendly_chars_sse42[i] = _mm_sub_epi8(
232
+ _mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),
233
+ _mm_set1_epi8('1'));
234
+ }
235
+ }
236
+
237
+ #else
238
+
239
+ #define SIMD_TARGET
240
+
241
+ #endif /* HAVE_SIMD_SSE4_2 */
242
+
204
243
  inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
205
244
  #ifdef HAVE_SIMD_NEON
206
245
  size_t size = 0;
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
220
259
 
221
260
  size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
222
261
  return total;
262
+ #elif defined(HAVE_SIMD_SSE4_2)
263
+ if (SIMD_Impl == SIMD_SSE42) {
264
+ if (len >= sizeof(__m128i)) {
265
+ return hibit_friendly_size_sse42(str, len);
266
+ }
267
+ }
268
+ return calculate_string_size(str, len, hibit_friendly_chars);
223
269
  #else
224
270
  return calculate_string_size(str, len, hibit_friendly_chars);
225
271
  #endif
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
944
990
  return result;
945
991
  }
946
992
 
993
+ #elif defined(HAVE_SIMD_SSE4_2)
994
+ typedef struct _sse42_match_result {
995
+ __m128i actions;
996
+ bool needs_escape;
997
+ int escape_mask;
998
+ bool has_some_hibit;
999
+ bool do_unicode_validation;
1000
+ } sse42_match_result;
1001
+
1002
+ static inline OJ_TARGET_SSE42 sse42_match_result
1003
+ sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
1004
+ sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
1005
+
1006
+ __m128i chunk = _mm_loadu_si128((__m128i *)str);
1007
+ __m128i actions = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
1008
+ __m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));
1009
+ result.actions = _mm_add_epi8(actions, _mm_set1_epi8('1'));
1010
+
1011
+ result.escape_mask = _mm_movemask_epi8(needs_escape);
1012
+ result.needs_escape = result.escape_mask != 0;
1013
+ if (has_hi && do_unicode_validation) {
1014
+ __m128i has_some_hibit = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
1015
+ result.has_some_hibit = _mm_movemask_epi8(has_some_hibit) != 0;
1016
+ result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;
1017
+ }
1018
+ return result;
1019
+ }
1020
+
947
1021
  #endif /* HAVE_SIMD_NEON */
948
1022
 
949
1023
  static inline FORCE_INLINE const char *process_character(char action,
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1023
1097
  #ifdef HAVE_SIMD_NEON
1024
1098
  uint8x16x4_t *cmap_neon = NULL;
1025
1099
  int neon_table_size = 0;
1100
+ #elif defined(HAVE_SIMD_SSE4_2)
1101
+ __m128i *cmap_sse42 = NULL;
1102
+ int sse42_tab_size;
1026
1103
  #endif /* HAVE_SIMD_NEON */
1027
1104
  const char *orig = str;
1028
1105
  bool has_hi = false;
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1091
1168
  #ifdef HAVE_SIMD_NEON
1092
1169
  cmap_neon = hibit_friendly_chars_neon;
1093
1170
  neon_table_size = 2;
1171
+ #elif defined(HAVE_SIMD_SSE4_2)
1172
+ cmap_sse42 = hibit_friendly_chars_sse42;
1173
+ sse42_tab_size = 8;
1094
1174
  #endif /* HAVE_SIMD_NEON */
1095
1175
  size = hibit_friendly_size((uint8_t *)str, cnt);
1096
1176
  }
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1118
1198
  if (is_sym) {
1119
1199
  *out->cur++ = ':';
1120
1200
  }
1121
- #ifdef HAVE_SIMD_NEON
1122
- const char *chunk_start;
1123
- const char *chunk_end;
1124
- const char *cursor = str;
1125
- bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
1126
- char matches[16];
1201
+
1202
+ #if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
1203
+
1127
1204
  #define SEARCH_FLUSH \
1128
1205
  if (str > cursor) { \
1129
1206
  APPEND_CHARS(out->cur, cursor, str - cursor); \
1130
1207
  cursor = str; \
1131
1208
  }
1132
1209
 
1133
- #endif /* HAVE_SIMD_NEON */
1210
+ const char *chunk_start;
1211
+ const char *chunk_end;
1212
+ const char *cursor = str;
1213
+ char matches[16];
1214
+ #endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
1215
+
1216
+ #if defined(HAVE_SIMD_NEON)
1217
+ bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
1218
+ #elif defined(HAVE_SIMD_SSE4_2)
1219
+ bool use_simd = false;
1220
+ if (SIMD_Impl == SIMD_SSE42) {
1221
+ use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
1222
+ }
1223
+ #endif
1224
+
1134
1225
  #ifdef HAVE_SIMD_NEON
1135
- if (use_neon) {
1226
+ if (use_simd) {
1136
1227
  while (str < end) {
1137
1228
  const char *chunk_ptr = NULL;
1138
1229
  if (str + sizeof(uint8x16_t) <= end) {
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
1195
1286
  }
1196
1287
  SEARCH_FLUSH;
1197
1288
  }
1198
- #endif /* HAVE_SIMD_NEON */
1289
+ #endif
1290
+
1291
+ #ifdef HAVE_SIMD_SSE4_2
1292
+ if (SIMD_Impl == SIMD_SSE42) {
1293
+ if (use_simd) {
1294
+ while (str < end) {
1295
+ const char *chunk_ptr = NULL;
1296
+ if (str + sizeof(__m128i) <= end) {
1297
+ chunk_ptr = str;
1298
+ chunk_start = str;
1299
+ chunk_end = str + sizeof(__m128i);
1300
+ } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
1301
+ memset(out->cur, 'A', sizeof(__m128i));
1302
+ memcpy(out->cur, str, (end - str));
1303
+ chunk_ptr = out->cur;
1304
+ chunk_start = str;
1305
+ chunk_end = end;
1306
+ } else {
1307
+ break;
1308
+ }
1309
+ sse42_match_result result = sse42_update(chunk_ptr,
1310
+ cmap_sse42,
1311
+ sse42_tab_size,
1312
+ do_unicode_validation,
1313
+ has_hi);
1314
+ if ((result.do_unicode_validation) || result.needs_escape) {
1315
+ SEARCH_FLUSH;
1316
+ _mm_storeu_si128((__m128i *)matches, result.actions);
1317
+ while (str < chunk_end) {
1318
+ long match_index = str - chunk_start;
1319
+ str = process_character(matches[match_index],
1320
+ str,
1321
+ end,
1322
+ out,
1323
+ orig,
1324
+ do_unicode_validation,
1325
+ &check_start);
1326
+ str++;
1327
+ }
1328
+ cursor = str;
1329
+ continue;
1330
+ }
1331
+ str = chunk_end;
1332
+ }
1333
+ SEARCH_FLUSH;
1334
+ }
1335
+ }
1336
+ #endif /* HAVE_SIMD_SSE4_2 */
1337
+
1199
1338
  for (; str < end; str++) {
1200
1339
  str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
1201
1340
  }
data/ext/oj/extconf.rb CHANGED
@@ -35,13 +35,16 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
35
35
 
36
36
  dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
37
37
 
38
- if with_config('--with-sse42')
39
- if try_cflags('-msse4.2')
40
- $CPPFLAGS += ' -msse4.2'
41
- dflags['OJ_USE_SSE4_2'] = 1
42
- else
43
- warn 'SSE 4.2 is not supported on this platform.'
44
- end
38
+ # SIMD optimizations use runtime CPU detection and function-level target attributes
39
+ # We do NOT add a global -msse4.2 flag here because:
40
+ # 1. It would cause illegal instruction errors on CPUs without SSE4.2
41
+ # 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
42
+ # 3. Runtime detection in oj_get_simd_implementation() selects the right path
43
+ #
44
+ # We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
45
+ # and needed for compiling the SSE2 fallback code on 32-bit x86
46
+ if try_cflags('-msse2')
47
+ $CPPFLAGS += ' -msse2'
45
48
  end
46
49
 
47
50
  if enable_config('trace-log', false)
data/ext/oj/oj.c CHANGED
@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
167
167
  VALUE oj_cache_mutex = Qnil;
168
168
  #endif
169
169
 
170
+ SIMD_Implementation SIMD_Impl = SIMD_NONE;
171
+
170
172
  extern void oj_parser_init();
171
173
 
172
174
  const char oj_json_class[] = "json_class";
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
1780
1782
  *
1781
1783
  * - *:wab* specifically for WAB data exchange.
1782
1784
  */
1785
+
1786
+ // =============================================================================
1787
+ // Runtime SIMD CPU detection
1788
+ // Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
1789
+ // =============================================================================
1790
+ SIMD_Implementation oj_get_simd_implementation(void) {
1791
+ #ifdef HAVE_SIMD_X86
1792
+ // x86/x86_64 runtime detection
1793
+
1794
+ #if defined(_MSC_VER)
1795
+ // MSVC: Use __cpuid intrinsic
1796
+ int cpu_info[4];
1797
+ __cpuid(cpu_info, 1);
1798
+
1799
+ // Check for SSE4.2 (bit 20 of ECX)
1800
+ if (cpu_info[2] & (1 << 20)) {
1801
+ return SIMD_SSE42;
1802
+ }
1803
+ // Check for SSE2 (bit 26 of EDX)
1804
+ if (cpu_info[3] & (1 << 26)) {
1805
+ return SIMD_SSE2;
1806
+ }
1807
+
1808
+ #elif defined(__GNUC__) || defined(__clang__)
1809
+ // GCC/Clang: Use __builtin_cpu_supports if available
1810
+ #if defined(__has_builtin)
1811
+ #if __has_builtin(__builtin_cpu_supports)
1812
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
1813
+ #endif
1814
+ #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
1815
+ // GCC 4.8+ has __builtin_cpu_supports
1816
+ #define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
1817
+ #endif
1818
+
1819
+ #ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
1820
+ #ifdef HAVE_SIMD_SSE4_2
1821
+ if (__builtin_cpu_supports("sse4.2")) {
1822
+ return SIMD_SSE42;
1823
+ }
1824
+ #endif
1825
+ #ifdef HAVE_SIMD_SSE2
1826
+ if (__builtin_cpu_supports("sse2")) {
1827
+ return SIMD_SSE2;
1828
+ }
1829
+ #endif
1830
+ #else
1831
+ // Fallback: Use CPUID instruction directly
1832
+ unsigned int eax, ebx, ecx, edx;
1833
+ if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
1834
+ // Check for SSE4.2 (bit 20 of ECX)
1835
+ if (ecx & (1 << 20)) {
1836
+ return SIMD_SSE42;
1837
+ }
1838
+ // Check for SSE2 (bit 26 of EDX)
1839
+ if (edx & (1 << 26)) {
1840
+ return SIMD_SSE2;
1841
+ }
1842
+ }
1843
+ #endif // OJ_HAS_BUILTIN_CPU_SUPPORTS
1844
+
1845
+ #endif // _MSC_VER vs GCC/Clang
1846
+
1847
+ #endif // HAVE_SIMD_X86
1848
+
1849
+ #ifdef HAVE_SIMD_NEON
1850
+ // ARM NEON is always available on ARM64 and detected at compile time
1851
+ return SIMD_NEON;
1852
+ #endif
1853
+
1854
+ return SIMD_NONE;
1855
+ }
1856
+
1783
1857
  void Init_oj(void) {
1784
1858
  int err = 0;
1785
1859
 
@@ -2080,10 +2154,18 @@ void Init_oj(void) {
2080
2154
  #endif
2081
2155
  oj_init_doc();
2082
2156
 
2157
+ SIMD_Impl = oj_get_simd_implementation();
2158
+
2083
2159
  oj_parser_init();
2084
2160
  oj_scanner_init();
2085
2161
 
2086
2162
  #ifdef HAVE_SIMD_NEON
2087
2163
  initialize_neon();
2088
2164
  #endif /* HAVE_SIMD_NEON */
2165
+
2166
+ #ifdef HAVE_SIMD_SSE4_2
2167
+ if (SIMD_Impl == SIMD_SSE42) {
2168
+ initialize_sse42();
2169
+ }
2170
+ #endif /* HAVE_SIMD_SSE4_2 */
2089
2171
  }
data/ext/oj/parse.c CHANGED
@@ -15,12 +15,9 @@
15
15
  #include "mem.h"
16
16
  #include "oj.h"
17
17
  #include "rxclass.h"
18
+ #include "simd.h"
18
19
  #include "val_stack.h"
19
20
 
20
- #ifdef OJ_USE_SSE4_2
21
- #include <nmmintrin.h>
22
- #endif
23
-
24
21
  // Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
25
22
  #define OJ_INFINITY (1.0 / 0.0)
26
23
 
@@ -202,23 +199,145 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
202
199
  return str;
203
200
  }
204
201
 
205
- #ifdef OJ_USE_SSE4_2
206
- static inline const char *scan_string_SIMD(const char *str, const char *end) {
207
- static const char chars[16] = "\x00\\\"";
208
- const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
209
- const char *_end = (const char *)(end - 16);
202
+ #ifdef HAVE_SIMD_SSE4_2
203
+ // Optimized SIMD string scanner using SSE4.2 instructions
204
+ // Uses prefetching and processes multiple chunks in parallel to reduce latency
205
+ // Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
206
+ static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {
207
+ static const char chars[16] = "\x00\\\"";
208
+ const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
209
+ const char *safe_end_64 = end - 64;
210
+ const char *safe_end_16 = end - 16;
211
+
212
+ // Process 64 bytes at a time with parallel SIMD operations
213
+ // This reduces pipeline stalls and improves instruction-level parallelism
214
+ while (str <= safe_end_64) {
215
+ // Prefetch next cache line for better memory throughput
216
+ OJ_PREFETCH(str + 64);
217
+
218
+ // Load and compare 4 chunks in parallel
219
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
220
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
221
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
222
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
223
+
224
+ const int r0 = _mm_cmpestri(terminate,
225
+ 3,
226
+ chunk0,
227
+ 16,
228
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
229
+ if (OJ_UNLIKELY(r0 != 16))
230
+ return str + r0;
231
+
232
+ const int r1 = _mm_cmpestri(terminate,
233
+ 3,
234
+ chunk1,
235
+ 16,
236
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
237
+ if (OJ_UNLIKELY(r1 != 16))
238
+ return str + 16 + r1;
239
+
240
+ const int r2 = _mm_cmpestri(terminate,
241
+ 3,
242
+ chunk2,
243
+ 16,
244
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
245
+ if (OJ_UNLIKELY(r2 != 16))
246
+ return str + 32 + r2;
247
+
248
+ const int r3 = _mm_cmpestri(terminate,
249
+ 3,
250
+ chunk3,
251
+ 16,
252
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
253
+ if (OJ_UNLIKELY(r3 != 16))
254
+ return str + 48 + r3;
255
+
256
+ str += 64;
257
+ }
210
258
 
211
- for (; str <= _end; str += 16) {
259
+ // Handle remaining 16-byte chunks
260
+ for (; str <= safe_end_16; str += 16) {
212
261
  const __m128i string = _mm_loadu_si128((const __m128i *)str);
213
262
  const int r = _mm_cmpestri(terminate,
214
263
  3,
215
264
  string,
216
265
  16,
217
266
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
218
- if (r != 16) {
219
- str = (char *)(str + r);
220
- return str;
221
- }
267
+ if (r != 16)
268
+ return str + r;
269
+ }
270
+
271
+ return scan_string_noSIMD(str, end);
272
+ }
273
+ #endif
274
+
275
+ #ifdef HAVE_SIMD_SSE2
276
+ // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
277
+ // Uses SSE2 instructions with prefetching and parallel processing
278
+ // Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
279
+ static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
280
+ const char *safe_end_64 = end - 64;
281
+ const char *safe_end_16 = end - 16;
282
+
283
+ // Create comparison vectors for our three special characters
284
+ const __m128i null_char = _mm_setzero_si128();
285
+ const __m128i backslash = _mm_set1_epi8('\\');
286
+ const __m128i quote = _mm_set1_epi8('"');
287
+
288
+ // Process 64 bytes at a time for better throughput
289
+ while (str <= safe_end_64) {
290
+ OJ_PREFETCH(str + 64);
291
+
292
+ // Load 4 chunks
293
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
294
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
295
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
296
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
297
+
298
+ // Compare all chunks (allows CPU to parallelize)
299
+ const __m128i cmp0 = _mm_or_si128(
300
+ _mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char), _mm_cmpeq_epi8(chunk0, backslash)),
301
+ _mm_cmpeq_epi8(chunk0, quote));
302
+ const __m128i cmp1 = _mm_or_si128(
303
+ _mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char), _mm_cmpeq_epi8(chunk1, backslash)),
304
+ _mm_cmpeq_epi8(chunk1, quote));
305
+ const __m128i cmp2 = _mm_or_si128(
306
+ _mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char), _mm_cmpeq_epi8(chunk2, backslash)),
307
+ _mm_cmpeq_epi8(chunk2, quote));
308
+ const __m128i cmp3 = _mm_or_si128(
309
+ _mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char), _mm_cmpeq_epi8(chunk3, backslash)),
310
+ _mm_cmpeq_epi8(chunk3, quote));
311
+
312
+ // Convert to masks
313
+ int mask0 = _mm_movemask_epi8(cmp0);
314
+ if (OJ_UNLIKELY(mask0 != 0))
315
+ return str + OJ_CTZ(mask0);
316
+
317
+ int mask1 = _mm_movemask_epi8(cmp1);
318
+ if (OJ_UNLIKELY(mask1 != 0))
319
+ return str + 16 + OJ_CTZ(mask1);
320
+
321
+ int mask2 = _mm_movemask_epi8(cmp2);
322
+ if (OJ_UNLIKELY(mask2 != 0))
323
+ return str + 32 + OJ_CTZ(mask2);
324
+
325
+ int mask3 = _mm_movemask_epi8(cmp3);
326
+ if (OJ_UNLIKELY(mask3 != 0))
327
+ return str + 48 + OJ_CTZ(mask3);
328
+
329
+ str += 64;
330
+ }
331
+
332
+ // Handle remaining 16-byte chunks
333
+ for (; str <= safe_end_16; str += 16) {
334
+ const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
335
+ const __m128i matches = _mm_or_si128(
336
+ _mm_or_si128(_mm_cmpeq_epi8(chunk, null_char), _mm_cmpeq_epi8(chunk, backslash)),
337
+ _mm_cmpeq_epi8(chunk, quote));
338
+ int mask = _mm_movemask_epi8(matches);
339
+ if (mask != 0)
340
+ return str + OJ_CTZ(mask);
222
341
  }
223
342
 
224
343
  return scan_string_noSIMD(str, end);
@@ -228,9 +347,20 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
228
347
  static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
229
348
 
230
349
  void oj_scanner_init(void) {
231
- #ifdef OJ_USE_SSE4_2
232
- scan_func = scan_string_SIMD;
350
+ // Use runtime CPU detection to select the best SIMD implementation
351
+ // This ensures we don't crash on CPUs that don't support SSE4.2
352
+ SIMD_Implementation impl = oj_get_simd_implementation();
353
+
354
+ switch (impl) {
355
+ #ifdef HAVE_SIMD_SSE4_2
356
+ case SIMD_SSE42: scan_func = scan_string_SSE42; break;
357
+ #endif
358
+ #ifdef HAVE_SIMD_SSE2
359
+ case SIMD_SSE2: scan_func = scan_string_SSE2; break;
233
360
  #endif
361
+ default: scan_func = scan_string_noSIMD; break;
362
+ }
363
+ // Note: ARM NEON string scanning would be added here if needed
234
364
  }
235
365
 
236
366
  // entered at /
data/ext/oj/rails.c CHANGED
@@ -661,13 +661,15 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
661
661
  Encoder e = OJ_R_ALLOC(struct _encoder);
662
662
 
663
663
  e->opts = oj_default_options;
664
- e->arg = Qnil;
665
664
  copy_opts(&ropts, &e->ropts);
666
665
 
667
666
  if (1 <= argc && Qnil != *argv) {
668
- oj_parse_options(*argv, &e->opts);
669
667
  e->arg = *argv;
668
+ } else {
669
+ e->arg = rb_hash_new();
670
670
  }
671
+ oj_parse_options(e->arg, &e->opts);
672
+
671
673
  return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
672
674
  }
673
675
 
data/ext/oj/simd.h CHANGED
@@ -1,10 +1,172 @@
1
1
  #ifndef OJ_SIMD_H
2
2
  #define OJ_SIMD_H
3
3
 
4
+ // SIMD architecture detection and configuration
5
+ // This header provides unified SIMD support across different CPU architectures
6
+ // with cross-platform runtime detection (Windows/Linux/Mac)
7
+
8
+ // SIMD implementation enum - used for runtime selection
9
+ typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
10
+
11
+ // Defined in oj.c.
12
+ extern SIMD_Implementation SIMD_Impl;
13
+
14
+ // Runtime CPU detection function (implemented in oj.c)
15
+ SIMD_Implementation oj_get_simd_implementation(void);
16
+
17
+ // =============================================================================
18
+ // Compiler compatibility macros
19
+ // =============================================================================
20
+
21
+ // Branch prediction hints
22
+ #if defined(__GNUC__) || defined(__clang__)
23
+ #define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
24
+ #define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
25
+ #else
26
+ #define OJ_LIKELY(x) (x)
27
+ #define OJ_UNLIKELY(x) (x)
28
+ #endif
29
+
30
+ // Prefetch hints
31
+ #if defined(__GNUC__) || defined(__clang__)
32
+ #define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
33
+ #elif defined(_MSC_VER)
34
+ #include <intrin.h>
35
+ #define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
36
+ #else
37
+ #define OJ_PREFETCH(addr) ((void)0)
38
+ #endif
39
+
40
+ // Count trailing zeros (for SSE2 mask scanning)
41
+ #if defined(__GNUC__) || defined(__clang__)
42
+ #define OJ_CTZ(x) __builtin_ctz(x)
43
+ #elif defined(_MSC_VER)
44
+ #include <intrin.h>
45
+ static __inline int oj_ctz_msvc(unsigned int x) {
46
+ unsigned long index;
47
+ _BitScanForward(&index, x);
48
+ return (int)index;
49
+ }
50
+ #define OJ_CTZ(x) oj_ctz_msvc(x)
51
+ #else
52
+ // Fallback: naive implementation
53
+ static inline int oj_ctz_fallback(unsigned int x) {
54
+ int count = 0;
55
+ while ((x & 1) == 0 && count < 32) {
56
+ x >>= 1;
57
+ count++;
58
+ }
59
+ return count;
60
+ }
61
+ #define OJ_CTZ(x) oj_ctz_fallback(x)
62
+ #endif
63
+
64
+ // =============================================================================
65
+ // x86/x86_64 SIMD detection
66
+ // =============================================================================
67
+ #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
68
+ #define HAVE_SIMD_X86 1
69
+
70
+ // Include appropriate SIMD headers
71
+ #if defined(_MSC_VER)
72
+ // MSVC: use intrin.h for all intrinsics
73
+ #include <intrin.h>
74
+ #define HAVE_SIMD_SSE4_2 1
75
+ #define HAVE_SIMD_SSE2 1
76
+ #elif defined(__GNUC__) || defined(__clang__)
77
+ // GCC/Clang: check for header availability and include them
78
+ // We include headers but use target attributes to enable instructions per-function
79
+ // Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
80
+ #if __has_include(<cpuid.h>)
81
+ #include <cpuid.h>
82
+ #endif
83
+ #if defined(__SSE4_2__) || defined(__SSE2__)
84
+ // If any SSE is enabled globally, x86intrin.h should be available
85
+ #include <x86intrin.h>
86
+ #define HAVE_SIMD_SSE4_2 1
87
+ #define HAVE_SIMD_SSE2 1
88
+ #else
89
+ // Try to include headers anyway for target attribute functions
90
+ #if __has_include(<x86intrin.h>)
91
+ #include <x86intrin.h>
92
+ #define HAVE_SIMD_SSE4_2 1
93
+ #define HAVE_SIMD_SSE2 1
94
+ #elif __has_include(<nmmintrin.h>)
95
+ #include <nmmintrin.h>
96
+ #define HAVE_SIMD_SSE4_2 1
97
+ #define HAVE_SIMD_SSE2 1
98
+ #elif __has_include(<emmintrin.h>)
99
+ #include <emmintrin.h>
100
+ #define HAVE_SIMD_SSE2 1
101
+ #endif
102
+ #endif
103
+ #endif
104
+
105
+ // Target attribute macros for function-level SIMD enabling
106
+ #if defined(__clang__) || defined(__GNUC__)
107
+ #define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
108
+ #define OJ_TARGET_SSE2 __attribute__((target("sse2")))
109
+ #else
110
+ // MSVC doesn't need target attributes - intrinsics are always available
111
+ #define OJ_TARGET_SSE42
112
+ #define OJ_TARGET_SSE2
113
+ #endif
114
+
115
+ #endif // x86/x86_64
116
+
117
+ // =============================================================================
118
+ // ARM NEON detection
119
+ // =============================================================================
4
120
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
5
121
  #define HAVE_SIMD_NEON 1
6
122
  #define SIMD_MINIMUM_THRESHOLD 6
7
123
  #include <arm_neon.h>
8
124
  #endif
9
125
 
10
- #endif /* OJ_SIMD_H */
126
+ // =============================================================================
127
+ // SIMD type string for debugging/logging
128
+ // =============================================================================
129
+ #if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
130
+ #define HAVE_SIMD_STRING_SCAN 1
131
+ #define SIMD_TYPE "x86 (runtime detected)"
132
+ #elif defined(HAVE_SIMD_NEON)
133
+ #define HAVE_SIMD_STRING_SCAN 1
134
+ #define SIMD_TYPE "NEON"
135
+ #else
136
+ #define SIMD_TYPE "none"
137
+ #endif
138
+
139
+ #if defined(HAVE_SIMD_SSE4_2)
140
+
141
+ #define SIMD_MINIMUM_THRESHOLD 6
142
+
143
+ extern void initialize_sse42(void);
144
+
145
+ static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
146
+ // Extract high 4 bits to determine which 16-byte chunk (0-15)
147
+ __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));
148
+
149
+ // Extract low 4 bits for index within the chunk (0-15)
150
+ __m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));
151
+
152
+ // Perform a lookup in each of the tab_size tables (results[] has room for at most 16)
153
+ __m128i results[16];
154
+ for (int i = 0; i < tab_size; i++) {
155
+ results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
156
+ }
157
+
158
+ // Create masks for each chunk and blend results
159
+ __m128i final_result = _mm_setzero_si128();
160
+
161
+ for (int i = 0; i < tab_size; i++) {
162
+ __m128i mask = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
163
+ __m128i masked_result = _mm_and_si128(mask, results[i]);
164
+ final_result = _mm_or_si128(final_result, masked_result);
165
+ }
166
+
167
+ return final_result;
168
+ }
169
+
170
+ #endif
171
+
172
+ #endif /* OJ_SIMD_H */
data/lib/oj/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Oj
2
2
  # Current version of the module.
3
- VERSION = '3.16.12'
3
+ VERSION = '3.16.14'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oj
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.16.12
4
+ version: 3.16.14
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler
@@ -23,20 +23,6 @@ dependencies:
23
23
  - - ">="
24
24
  - !ruby/object:Gem::Version
25
25
  version: '3.0'
26
- - !ruby/object:Gem::Dependency
27
- name: ostruct
28
- requirement: !ruby/object:Gem::Requirement
29
- requirements:
30
- - - ">="
31
- - !ruby/object:Gem::Version
32
- version: '0.2'
33
- type: :runtime
34
- prerelease: false
35
- version_requirements: !ruby/object:Gem::Requirement
36
- requirements:
37
- - - ">="
38
- - !ruby/object:Gem::Version
39
- version: '0.2'
40
26
  - !ruby/object:Gem::Dependency
41
27
  name: minitest
42
28
  requirement: !ruby/object:Gem::Requirement
@@ -229,7 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
229
215
  - !ruby/object:Gem::Version
230
216
  version: '0'
231
217
  requirements: []
232
- rubygems_version: 3.6.9
218
+ rubygems_version: 4.0.3
233
219
  specification_version: 4
234
220
  summary: A fast JSON parser and serializer.
235
221
  test_files: []