RubyGems - oj - Versions diffs - 3.16.12 → 3.16.14 - Mend

oj 3.16.12 → 3.16.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6e2053531cd4c7c7b49bf16ddafbfda868e67e66c1bfbec1b06daaa0ba3f1c45
-  data.tar.gz: ce029b2f90660922dd8fb335c23463a1fe1d81317d56e48f2cfddc9271de2d15
+  metadata.gz: c97d24d950284c4f108b8ff933e4358250d53b69f9300b3faf2b8b7f29fdb5b7
+  data.tar.gz: 3112c763244b2f558e4f2a8f0ae79c01dcea9ec4281a915a2e39a404a34bf8f1
 SHA512:
-  metadata.gz: deb7f1447b5022adad6d7387b8a8bfd866d399abc2d9e434f7e6d321fa73cb1738ff9aa7ee22ac064c455d5d3951ba7469a30720c9474af5e96c70eaa5b5303a
-  data.tar.gz: b9d28d76c714947c1e6b133225e348838ad13f1c8b7dc82f0fee261272259cd0e83b911f2f2762c55a719f93e669275ad32d393f371c264a4e587e49b1c3a84b
+  metadata.gz: 3f32303a7e78478137b76fbadf4bcab93d164d60918fd5b8c20bccdd7cf0476e290e1290e4bc114f3fd6a3afaac0fb3c9df61c91ede88bfb9a9762d098731688
+  data.tar.gz: bcc7411ae7cde8ff0457563f67d99093b09c3d70a36a13c5db1144dbf983d5f864f204f421a966be30d3a2c3610f82033ed08c6c0c86d7ff2e8b391b4cd5b02f

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,17 @@
 # CHANGELOG
+## 3.16.14 - 2026-02-04
+- Fixed SSE issue #989.
+- Removed ostruct dependency.
+- Removed generic object JSON gem tests.
+## 3.16.13 - 2025-12-05
+- Fixed rails encoding for Hash and Array subclasses.
 ## 3.16.12 - 2025-10-29
 - Fixed dump realloc bug that occurred when using the compat mode dump options.

data/ext/oj/dump.c CHANGED Viewed

@@ -201,6 +201,45 @@ void initialize_neon(void) {
 }
 #endif
+#ifdef HAVE_SIMD_SSE4_2
+static __m128i hibit_friendly_chars_sse42[8];
+// From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
+// Horizontally sums the 16 unsigned bytes of v and returns the total.
+// _mm_sad_epu8 against zero leaves one partial sum in each 64-bit half of the
+// register; the two halves are then added together.
+// Declared static: a plain C99/C11 `inline` definition emits no external symbol,
+// so any call the compiler declines to inline would be left unresolved.
+// OJ_TARGET_SSE42 matches the attribute on the only caller in this file.
+static inline OJ_TARGET_SSE42 uint32_t _mm_sum_epu8(const __m128i v) {
+    __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
+    // Low-half sum is in the low 32 bits; high-half sum is in 16-bit lane 4.
+    return (uint32_t)_mm_cvtsi128_si32(vsum) + (uint32_t)_mm_extract_epi16(vsum, 4);
+}
+inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {  // Size for str[0..len) per the hibit_friendly table, accumulated 16 bytes at a time.
+    size_t size = 0;  // running total over the full 16-byte chunks
+    size_t i    = 0;  // bytes consumed by the vector loop
+    for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
+        size += sizeof(__m128i);  // SSE tables hold (entry - '1'); presumably this restores one per byte - confirm against calculate_string_size
+        __m128i chunk = _mm_loadu_si128((__m128i *)str);  // unaligned 16-byte load
+        __m128i tmp   = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);  // per-byte (entry - '1'); bytes >= 0x80 fall outside the 8 tables and yield 0
+        size += _mm_sum_epu8(tmp);  // horizontal sum of the 16 per-byte values
+    }
+    size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);  // scalar helper covers the remaining (< 16 byte) tail
+    return total;
+}
+void OJ_TARGET_SSE42 initialize_sse42(void) {  // Builds hibit_friendly_chars_sse42 from the first 128 bytes of hibit_friendly_chars.
+    for (int i = 0; i < 8; i++) {  // 8 tables x 16 bytes = entries for byte values 0x00-0x7F
+        hibit_friendly_chars_sse42[i] = _mm_sub_epi8(  // store each entry biased by -'1', so an entry of '1' becomes 0
+            _mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),  // unaligned load of the i-th 16-byte slice
+            _mm_set1_epi8('1'));
+    }
+}
+#else
+#define SIMD_TARGET
+#endif /* HAVE_SIMD_SSE4_2 */
 inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
 #ifdef HAVE_SIMD_NEON
     size_t size = 0;
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
     size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
     return total;
+#elif defined(HAVE_SIMD_SSE4_2)
+    if (SIMD_Impl == SIMD_SSE42) {
+        if (len >= sizeof(__m128i)) {
+            return hibit_friendly_size_sse42(str, len);
+        }
+    }
+    return calculate_string_size(str, len, hibit_friendly_chars);
 #else
     return calculate_string_size(str, len, hibit_friendly_chars);
 #endif
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
     return result;
 }
+#elif defined(HAVE_SIMD_SSE4_2)
+typedef struct _sse42_match_result {  // Result of classifying one 16-byte chunk in sse42_update().
+    __m128i actions;  // per-byte table entries with the '1' bias added back
+    bool    needs_escape;  // true when escape_mask is non-zero
+    int     escape_mask;  // movemask bits: bit n set when byte n's biased lookup was non-zero
+    bool    has_some_hibit;  // true when any byte has bit 0x80 set; only computed when has_hi && do_unicode_validation
+    bool    do_unicode_validation;  // has_hi && do_unicode_validation && has_some_hibit
+} sse42_match_result;
+static inline OJ_TARGET_SSE42 sse42_match_result  // Classifies the 16 bytes at str against cmap_sse42; str must have 16 readable bytes.
+sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
+    sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};  // both flags stay false unless the branch below runs
+    __m128i chunk        = _mm_loadu_si128((__m128i *)str);  // unaligned 16-byte load
+    __m128i actions      = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);  // biased entries: 0 where the table entry was '1'
+    __m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));  // bitwise NOT of (actions == 0): 0xFF where the biased entry is non-zero
+    result.actions       = _mm_add_epi8(actions, _mm_set1_epi8('1'));  // add '1' back so each byte is a table character again
+    result.escape_mask  = _mm_movemask_epi8(needs_escape);  // one bit per byte position
+    result.needs_escape = result.escape_mask != 0;
+    if (has_hi && do_unicode_validation) {
+        __m128i has_some_hibit       = _mm_and_si128(chunk, _mm_set1_epi8(0x80));  // isolate the top bit of every byte
+        result.has_some_hibit        = _mm_movemask_epi8(has_some_hibit) != 0;
+        result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;  // first two operands are already true inside this branch
+    }
+    return result;
+}
 #endif /* HAVE_SIMD_NEON */
 static inline FORCE_INLINE const char *process_character(char         action,
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
 #ifdef HAVE_SIMD_NEON
     uint8x16x4_t *cmap_neon       = NULL;
     int           neon_table_size = 0;
+#elif defined(HAVE_SIMD_SSE4_2)
+    __m128i *cmap_sse42 = NULL;
+    int      sse42_tab_size;
 #endif /* HAVE_SIMD_NEON */
     const char *orig                  = str;
     bool        has_hi                = false;
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
 #ifdef HAVE_SIMD_NEON
         cmap_neon       = hibit_friendly_chars_neon;
         neon_table_size = 2;
+#elif defined(HAVE_SIMD_SSE4_2)
+        cmap_sse42     = hibit_friendly_chars_sse42;
+        sse42_tab_size = 8;
 #endif /* HAVE_NEON_SIMD */
         size = hibit_friendly_size((uint8_t *)str, cnt);
     }
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         if (is_sym) {
             *out->cur++ = ':';
         }
-#ifdef HAVE_SIMD_NEON
-        const char *chunk_start;
-        const char *chunk_end;
-        const char *cursor   = str;
-        bool        use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
-        char        matches[16];
+#if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
 #define SEARCH_FLUSH                                  \
     if (str > cursor) {                               \
         APPEND_CHARS(out->cur, cursor, str - cursor); \
         cursor = str;                                 \
     }
-#endif /* HAVE_SIMD_NEON */
+        const char *chunk_start;
+        const char *chunk_end;
+        const char *cursor = str;
+        char        matches[16];
+#endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
+#if defined(HAVE_SIMD_NEON)
+        bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
+#elif defined(HAVE_SIMD_SSE4_2)
+        bool use_simd = false;
+        if (SIMD_Impl == SIMD_SSE42) {
+            use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
+        }
+#endif
 #ifdef HAVE_SIMD_NEON
-        if (use_neon) {
+        if (use_simd) {
             while (str < end) {
                 const char *chunk_ptr = NULL;
                 if (str + sizeof(uint8x16_t) <= end) {
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
             }
             SEARCH_FLUSH;
         }
-#endif /* HAVE_SIMD_NEON */
+#endif
+#ifdef HAVE_SIMD_SSE4_2
+        if (SIMD_Impl == SIMD_SSE42) {
+            if (use_simd) {
+                while (str < end) {
+                    const char *chunk_ptr = NULL;
+                    if (str + sizeof(__m128i) <= end) {
+                        chunk_ptr   = str;
+                        chunk_start = str;
+                        chunk_end   = str + sizeof(__m128i);
+                    } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
+                        memset(out->cur, 'A', sizeof(__m128i));
+                        memcpy(out->cur, str, (end - str));
+                        chunk_ptr   = out->cur;
+                        chunk_start = str;
+                        chunk_end   = end;
+                    } else {
+                        break;
+                    }
+                    sse42_match_result result = sse42_update(chunk_ptr,
+                                                             cmap_sse42,
+                                                             sse42_tab_size,
+                                                             do_unicode_validation,
+                                                             has_hi);
+                    if ((result.do_unicode_validation) || result.needs_escape) {
+                        SEARCH_FLUSH;
+                        _mm_storeu_si128((__m128i *)matches, result.actions);
+                        while (str < chunk_end) {
+                            long match_index = str - chunk_start;
+                            str              = process_character(matches[match_index],
+                                                    str,
+                                                    end,
+                                                    out,
+                                                    orig,
+                                                    do_unicode_validation,
+                                                    &check_start);
+                            str++;
+                        }
+                        cursor = str;
+                        continue;
+                    }
+                    str = chunk_end;
+                }
+                SEARCH_FLUSH;
+            }
+        }
+#endif /* HAVE_SIMD_SSE4_2 */
         for (; str < end; str++) {
             str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
         }

data/ext/oj/extconf.rb CHANGED Viewed

@@ -35,13 +35,16 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
 dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
-if with_config('--with-sse42')
-  if try_cflags('-msse4.2')
-    $CPPFLAGS += ' -msse4.2'
-    dflags['OJ_USE_SSE4_2'] = 1
-  else
-    warn 'SSE 4.2 is not supported on this platform.'
-  end
+# SIMD optimizations use runtime CPU detection and function-level target attributes
+# We do NOT add a global -msse4.2 flag here because:
+# 1. It would cause illegal instruction errors on CPUs without SSE4.2
+# 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
+# 3. Runtime detection in oj_get_simd_implementation() selects the right path
+#
+# We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
+# and needed for compiling the SSE2 fallback code on 32-bit x86
+if try_cflags('-msse2')
+  $CPPFLAGS += ' -msse2'
 end
 if enable_config('trace-log', false)

data/ext/oj/oj.c CHANGED Viewed

@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
 VALUE oj_cache_mutex = Qnil;
 #endif
+SIMD_Implementation SIMD_Impl = SIMD_NONE;
 extern void oj_parser_init();
 const char oj_json_class[] = "json_class";
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
  *
  * - *:wab* specifically for WAB data exchange.
  */
+// =============================================================================
+// Runtime SIMD CPU detection
+// Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
+// =============================================================================
+SIMD_Implementation oj_get_simd_implementation(void) {  // Returns the SIMD level to use: SSE4.2 then SSE2 on x86, NEON when compiled for it, otherwise SIMD_NONE.
+#ifdef HAVE_SIMD_X86
+    // x86/x86_64 runtime detection
+#if defined(_MSC_VER)
+    // MSVC: Use __cpuid intrinsic
+    int cpu_info[4];  // filled by __cpuid; indexes 2 and 3 are read below as ECX and EDX
+    __cpuid(cpu_info, 1);  // CPUID leaf 1
+    // Check for SSE4.2 (bit 20 of ECX)
+    if (cpu_info[2] & (1 << 20)) {
+        return SIMD_SSE42;
+    }
+    // Check for SSE2 (bit 26 of EDX)
+    if (cpu_info[3] & (1 << 26)) {
+        return SIMD_SSE2;
+    }
+#elif defined(__GNUC__) || defined(__clang__)
+    // GCC/Clang: Use __builtin_cpu_supports if available
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_cpu_supports)
+#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+#endif
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+    // GCC 4.8+ has __builtin_cpu_supports
+#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+#endif
+#ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
+#ifdef HAVE_SIMD_SSE4_2
+    if (__builtin_cpu_supports("sse4.2")) {  // preferred level, checked first
+        return SIMD_SSE42;
+    }
+#endif
+#ifdef HAVE_SIMD_SSE2
+    if (__builtin_cpu_supports("sse2")) {
+        return SIMD_SSE2;
+    }
+#endif
+#else
+    // Fallback: Use CPUID instruction directly
+    unsigned int eax, ebx, ecx, edx;
+    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {  // NOTE(review): unlike the builtin path, this branch is not gated on HAVE_SIMD_SSE4_2 / HAVE_SIMD_SSE2
+        // Check for SSE4.2 (bit 20 of ECX)
+        if (ecx & (1 << 20)) {
+            return SIMD_SSE42;
+        }
+        // Check for SSE2 (bit 26 of EDX)
+        if (edx & (1 << 26)) {
+            return SIMD_SSE2;
+        }
+    }
+#endif  // OJ_HAS_BUILTIN_CPU_SUPPORTS
+#endif  // _MSC_VER vs GCC/Clang
+#endif  // HAVE_SIMD_X86
+#ifdef HAVE_SIMD_NEON
+    // ARM NEON is always available on ARM64 and detected at compile time
+    return SIMD_NEON;
+#endif
+    return SIMD_NONE;  // reached on x86 when no SSE level was detected, and on other non-NEON builds
+}
 void Init_oj(void) {
     int err = 0;
@@ -2080,10 +2154,18 @@ void Init_oj(void) {
 #endif
     oj_init_doc();
+    SIMD_Impl = oj_get_simd_implementation();
     oj_parser_init();
     oj_scanner_init();
 #ifdef HAVE_SIMD_NEON
     initialize_neon();
 #endif /* HAVE_SIMD_NEON */
+#ifdef HAVE_SIMD_SSE4_2
+    if (SIMD_Impl == SIMD_SSE42) {
+        initialize_sse42();
+    }
+#endif /* HAVE_SIMD_SSE4_2 */
 }

data/ext/oj/parse.c CHANGED Viewed

@@ -15,12 +15,9 @@
 #include "mem.h"
 #include "oj.h"
 #include "rxclass.h"
+#include "simd.h"
 #include "val_stack.h"
-#ifdef OJ_USE_SSE4_2
-#include <nmmintrin.h>
-#endif
 // Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
 #define OJ_INFINITY (1.0 / 0.0)
@@ -202,23 +199,145 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
     return str;
 }
-#ifdef OJ_USE_SSE4_2
-static inline const char *scan_string_SIMD(const char *str, const char *end) {
-    static const char chars[16] = "\x00\\\"";
-    const __m128i     terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
-    const char       *_end      = (const char *)(end - 16);
+#ifdef HAVE_SIMD_SSE4_2
+// Optimized SIMD string scanner using SSE4.2 instructions
+// Uses prefetching and processes multiple chunks in parallel to reduce latency
+// Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
+static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {  // Returns a pointer to the first NUL, backslash or double quote found by the SIMD loops, else defers to scan_string_noSIMD.
+    static const char chars[16]   = "\x00\\\"";  // the three bytes searched for: NUL, backslash, double quote
+    const __m128i     terminate   = _mm_loadu_si128((const __m128i *)&chars[0]);
+    const char       *safe_end_64 = end - 64;  // NOTE(review): if fewer than 64 bytes precede end this points before the buffer - confirm callers or compare (end - str) instead
+    const char       *safe_end_16 = end - 16;  // NOTE(review): same concern for inputs shorter than 16 bytes
+    // Process 64 bytes at a time with parallel SIMD operations
+    // This reduces pipeline stalls and improves instruction-level parallelism
+    while (str <= safe_end_64) {
+        // Prefetch next cache line for better memory throughput
+        OJ_PREFETCH(str + 64);
+        // Load and compare 4 chunks in parallel
+        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
+        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
+        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
+        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
+        const int r0 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk0,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);  // explicit length 3 keeps the NUL byte in the search set
+        if (OJ_UNLIKELY(r0 != 16))  // 16 is treated as "no match in this chunk"
+            return str + r0;
+        const int r1 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk1,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (OJ_UNLIKELY(r1 != 16))
+            return str + 16 + r1;
+        const int r2 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk2,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (OJ_UNLIKELY(r2 != 16))
+            return str + 32 + r2;
+        const int r3 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk3,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (OJ_UNLIKELY(r3 != 16))
+            return str + 48 + r3;
+        str += 64;
+    }
-    for (; str <= _end; str += 16) {
+    // Handle remaining 16-byte chunks
+    for (; str <= safe_end_16; str += 16) {
         const __m128i string = _mm_loadu_si128((const __m128i *)str);
         const int     r      = _mm_cmpestri(terminate,
                                    3,
                                    string,
                                    16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (r != 16) {
-            str = (char *)(str + r);
-            return str;
-        }
+        if (r != 16)
+            return str + r;
+    }
+    return scan_string_noSIMD(str, end);  // scalar scan over whatever the 16-byte loop left (fewer than 16 bytes)
+}
+#endif
+#ifdef HAVE_SIMD_SSE2
+// Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
+// Uses SSE2 instructions with prefetching and parallel processing
+// Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
+static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
+    const char *safe_end_64 = end - 64;
+    const char *safe_end_16 = end - 16;
+    // Create comparison vectors for our three special characters
+    const __m128i null_char = _mm_setzero_si128();
+    const __m128i backslash = _mm_set1_epi8('\\');
+    const __m128i quote     = _mm_set1_epi8('"');
+    // Process 64 bytes at a time for better throughput
+    while (str <= safe_end_64) {
+        OJ_PREFETCH(str + 64);
+        // Load 4 chunks
+        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
+        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
+        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
+        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
+        // Compare all chunks (allows CPU to parallelize)
+        const __m128i cmp0 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char), _mm_cmpeq_epi8(chunk0, backslash)),
+            _mm_cmpeq_epi8(chunk0, quote));
+        const __m128i cmp1 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char), _mm_cmpeq_epi8(chunk1, backslash)),
+            _mm_cmpeq_epi8(chunk1, quote));
+        const __m128i cmp2 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char), _mm_cmpeq_epi8(chunk2, backslash)),
+            _mm_cmpeq_epi8(chunk2, quote));
+        const __m128i cmp3 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char), _mm_cmpeq_epi8(chunk3, backslash)),
+            _mm_cmpeq_epi8(chunk3, quote));
+        // Convert to masks
+        int mask0 = _mm_movemask_epi8(cmp0);
+        if (OJ_UNLIKELY(mask0 != 0))
+            return str + OJ_CTZ(mask0);
+        int mask1 = _mm_movemask_epi8(cmp1);
+        if (OJ_UNLIKELY(mask1 != 0))
+            return str + 16 + OJ_CTZ(mask1);
+        int mask2 = _mm_movemask_epi8(cmp2);
+        if (OJ_UNLIKELY(mask2 != 0))
+            return str + 32 + OJ_CTZ(mask2);
+        int mask3 = _mm_movemask_epi8(cmp3);
+        if (OJ_UNLIKELY(mask3 != 0))
+            return str + 48 + OJ_CTZ(mask3);
+        str += 64;
+    }
+    // Handle remaining 16-byte chunks
+    for (; str <= safe_end_16; str += 16) {
+        const __m128i chunk   = _mm_loadu_si128((const __m128i *)str);
+        const __m128i matches = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk, null_char), _mm_cmpeq_epi8(chunk, backslash)),
+            _mm_cmpeq_epi8(chunk, quote));
+        int mask = _mm_movemask_epi8(matches);
+        if (mask != 0)
+            return str + OJ_CTZ(mask);
     }
     return scan_string_noSIMD(str, end);
@@ -228,9 +347,20 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
 static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
 void oj_scanner_init(void) {
-#ifdef OJ_USE_SSE4_2
-    scan_func = scan_string_SIMD;
+    // Use runtime CPU detection to select the best SIMD implementation
+    // This ensures we don't crash on CPUs that don't support SSE4.2
+    SIMD_Implementation impl = oj_get_simd_implementation();  // queried directly here rather than read from the SIMD_Impl global
+    switch (impl) {
+#ifdef HAVE_SIMD_SSE4_2
+    case SIMD_SSE42: scan_func = scan_string_SSE42; break;
+#endif
+#ifdef HAVE_SIMD_SSE2
+    case SIMD_SSE2: scan_func = scan_string_SSE2; break;
 #endif
+    default: scan_func = scan_string_noSIMD; break;  // also taken for SIMD_NEON and for any level whose scanner was compiled out
+    }
+    // Note: ARM NEON string scanning would be added here if needed
 }
 // entered at /

data/ext/oj/rails.c CHANGED Viewed

@@ -661,13 +661,15 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
     Encoder e = OJ_R_ALLOC(struct _encoder);
     e->opts = oj_default_options;
-    e->arg  = Qnil;
     copy_opts(&ropts, &e->ropts);
     if (1 <= argc && Qnil != *argv) {
-        oj_parse_options(*argv, &e->opts);
         e->arg = *argv;
+    } else {
+        e->arg = rb_hash_new();
     }
+    oj_parse_options(e->arg, &e->opts);
     return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
 }

data/ext/oj/simd.h CHANGED Viewed

@@ -1,10 +1,172 @@
 #ifndef OJ_SIMD_H
 #define OJ_SIMD_H
+// SIMD architecture detection and configuration
+// This header provides unified SIMD support across different CPU architectures
+// with cross-platform runtime detection (Windows/Linux/Mac)
+// SIMD implementation enum - used for runtime selection
+typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
+// Defined in oj.c.
+extern SIMD_Implementation SIMD_Impl;
+// Runtime CPU detection function (implemented in oj.c)
+SIMD_Implementation oj_get_simd_implementation(void);
+// =============================================================================
+// Compiler compatibility macros
+// =============================================================================
+// Branch prediction hints
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
+#define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define OJ_LIKELY(x) (x)
+#define OJ_UNLIKELY(x) (x)
+#endif
+// Prefetch hints
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
+#else
+#define OJ_PREFETCH(addr) ((void)0)
+#endif
+// Count trailing zeros (for SSE2 mask scanning)
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_CTZ(x) __builtin_ctz(x)
+#elif defined(_MSC_VER)
+#include <intrin.h>
+static __inline int oj_ctz_msvc(unsigned int x) {  // MSVC count-trailing-zeros via _BitScanForward; backs OJ_CTZ.
+    unsigned long index;  // NOTE(review): left uninitialized if _BitScanForward does not write it (x == 0); visible OJ_CTZ callers pass non-zero masks only
+    _BitScanForward(&index, x);
+    return (int)index;
+}
+#define OJ_CTZ(x) oj_ctz_msvc(x)
+#else
+// Fallback: naive implementation
+static inline int oj_ctz_fallback(unsigned int x) {  // Portable count-trailing-zeros: number of zero bits below the lowest set bit of x.
+    int count = 0;
+    while ((x & 1) == 0 && count < 32) {  // the count bound ends the loop when x == 0, giving 32
+        x >>= 1;
+        count++;
+    }
+    return count;
+}
+#define OJ_CTZ(x) oj_ctz_fallback(x)
+#endif
+// =============================================================================
+// x86/x86_64 SIMD detection
+// =============================================================================
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
+#define HAVE_SIMD_X86 1
+// Include appropriate SIMD headers
+#if defined(_MSC_VER)
+// MSVC: use intrin.h for all intrinsics
+#include <intrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#elif defined(__GNUC__) || defined(__clang__)
+// GCC/Clang: check for header availability and include them
+// We include headers but use target attributes to enable instructions per-function
+// Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
+#if __has_include(<cpuid.h>)
+#include <cpuid.h>
+#endif
+#if defined(__SSE4_2__) || defined(__SSE2__)
+// If any SSE is enabled globally, x86intrin.h should be available
+#include <x86intrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#else
+// Try to include headers anyway for target attribute functions
+#if __has_include(<x86intrin.h>)
+#include <x86intrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#elif __has_include(<nmmintrin.h>)
+#include <nmmintrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#elif __has_include(<emmintrin.h>)
+#include <emmintrin.h>
+#define HAVE_SIMD_SSE2 1
+#endif
+#endif
+#endif
+// Target attribute macros for function-level SIMD enabling
+#if defined(__clang__) || defined(__GNUC__)
+#define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
+#define OJ_TARGET_SSE2 __attribute__((target("sse2")))
+#else
+// MSVC doesn't need target attributes - intrinsics are always available
+#define OJ_TARGET_SSE42
+#define OJ_TARGET_SSE2
+#endif
+#endif  // x86/x86_64
+// =============================================================================
+// ARM NEON detection
+// =============================================================================
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #define HAVE_SIMD_NEON 1
 #define SIMD_MINIMUM_THRESHOLD 6
 #include <arm_neon.h>
 #endif
-#endif /* OJ_SIMD_H */
+// =============================================================================
+// SIMD type string for debugging/logging
+// =============================================================================
+#if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
+#define HAVE_SIMD_STRING_SCAN 1
+#define SIMD_TYPE "x86 (runtime detected)"
+#elif defined(HAVE_SIMD_NEON)
+#define HAVE_SIMD_STRING_SCAN 1
+#define SIMD_TYPE "NEON"
+#else
+#define SIMD_TYPE "none"
+#endif
+#if defined(HAVE_SIMD_SSE4_2)
+#define SIMD_MINIMUM_THRESHOLD 6
+extern void initialize_sse42(void);
+// Byte-wise lookup of `input` in a table split into `tab_size` 16-byte slices.
+// For each input byte, the high nibble selects the slice and the low nibble
+// selects the entry within it. Bytes whose high nibble is >= tab_size match no
+// slice and produce 0 in the result.
+//
+// input        - 16 bytes to translate.
+// lookup_table - array of at least tab_size __m128i slices.
+// tab_size     - number of slices to consult; callers in this file pass 8.
+//
+// Lookup and blend happen in a single pass, so no fixed-size scratch array is
+// needed and tab_size cannot index past one.
+static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
+    const __m128i nibble_mask = _mm_set1_epi8(0x0F);
+    // High 4 bits of each byte: which 16-byte slice to use (0-15).
+    __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), nibble_mask);
+    // Low 4 bits of each byte: index within the slice (0-15).
+    __m128i low_index = _mm_and_si128(input, nibble_mask);
+    __m128i final_result = _mm_setzero_si128();
+    for (int i = 0; i < tab_size; i++) {
+        // Look every byte up in slice i, then keep only the lanes that selected slice i.
+        __m128i looked_up = _mm_shuffle_epi8(lookup_table[i], low_index);
+        __m128i mask      = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8((char)i));
+        final_result      = _mm_or_si128(final_result, _mm_and_si128(mask, looked_up));
+    }
+    return final_result;
+}
+#endif
+#endif /* OJ_SIMD_H */

data/lib/oj/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Oj
   # Current version of the module.
-  VERSION = '3.16.12'
+  VERSION = '3.16.14'
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: oj
 version: !ruby/object:Gem::Version
-  version: 3.16.12
+  version: 3.16.14
 platform: ruby
 authors:
 - Peter Ohler
@@ -23,20 +23,6 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '3.0'
-- !ruby/object:Gem::Dependency
-  name: ostruct
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0.2'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0.2'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
@@ -229,7 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.9
+rubygems_version: 4.0.3
 specification_version: 4
 summary: A fast JSON parser and serializer.
 test_files: []