RubyGems - oj - Versions diffs - 3.16.13 → 3.16.14 - Mend

oj 3.16.13 → 3.16.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
-  data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
+  metadata.gz: c97d24d950284c4f108b8ff933e4358250d53b69f9300b3faf2b8b7f29fdb5b7
+  data.tar.gz: 3112c763244b2f558e4f2a8f0ae79c01dcea9ec4281a915a2e39a404a34bf8f1
 SHA512:
-  metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
-  data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b
+  metadata.gz: 3f32303a7e78478137b76fbadf4bcab93d164d60918fd5b8c20bccdd7cf0476e290e1290e4bc114f3fd6a3afaac0fb3c9df61c91ede88bfb9a9762d098731688
+  data.tar.gz: bcc7411ae7cde8ff0457563f67d99093b09c3d70a36a13c5db1144dbf983d5f864f204f421a966be30d3a2c3610f82033ed08c6c0c86d7ff2e8b391b4cd5b02f

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,13 @@
 # CHANGELOG
+## 3.16.14 - 2026-02-04
+-  Fixed SSE issue #989.
+- Removed ostruct dependency.
+- Removed generic object JSON gem tests.
 ## 3.16.13 - 2025-12-05
 - Fixed rails encoding for Hash and Array subclasses.

data/ext/oj/dump.c CHANGED Viewed

@@ -201,6 +201,45 @@ void initialize_neon(void) {
 }
 #endif
+#ifdef HAVE_SIMD_SSE4_2
+static __m128i hibit_friendly_chars_sse42[8];
+// Horizontally sums the 16 unsigned bytes of v and returns the total.
+// _mm_sad_epu8 against zero leaves one partial sum per 64-bit half (in the
+// 16-bit words 0 and 4 of the result); the two halves are then added.
+// From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
+// Declared static: a plain `inline` definition in C99/C11 provides no external
+// definition, so any call the compiler declines to inline would fail to link.
+static inline uint32_t _mm_sum_epu8(const __m128i v) {
+    __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
+    return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
+}
+// Computes the dumped size of str[0..len) using the SSE4.2 lookup tables.
+// Every full 16-byte chunk contributes 16 plus the byte-wise sum of the values
+// looked up in hibit_friendly_chars_sse42; whatever is left after the last
+// full chunk is measured by calculate_string_size with hibit_friendly_chars.
+inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
+    const size_t step     = sizeof(__m128i);
+    size_t       total    = 0;
+    size_t       consumed = 0;
+    while (consumed + step <= len) {
+        __m128i block = _mm_loadu_si128((__m128i *)str);
+        __m128i extra = vector_lookup_sse42(block, hibit_friendly_chars_sse42, 8);
+        total += step;
+        total += _mm_sum_epu8(extra);
+        consumed += step;
+        str += step;
+    }
+    // Scalar pass over the tail that did not fill a whole vector.
+    return total + calculate_string_size(str, len - consumed, hibit_friendly_chars);
+}
+// Fills the eight 16-byte SSE4.2 lookup tables from the first 128 bytes of
+// hibit_friendly_chars, storing each entry with '1' subtracted from it.
+void OJ_TARGET_SSE42 initialize_sse42(void) {
+    const __m128i bias = _mm_set1_epi8('1');
+    for (size_t slot = 0; slot < 8; slot++) {
+        __m128i raw = _mm_loadu_si128((__m128i *)(hibit_friendly_chars + slot * sizeof(__m128i)));
+        hibit_friendly_chars_sse42[slot] = _mm_sub_epi8(raw, bias);
+    }
+}
+#else
+#define SIMD_TARGET
+#endif /* HAVE_SIMD_SSE4_2 */
 inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
 #ifdef HAVE_SIMD_NEON
     size_t size = 0;
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
     size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
     return total;
+#elif defined(HAVE_SIMD_SSE4_2)
+    if (SIMD_Impl == SIMD_SSE42) {
+        if (len >= sizeof(__m128i)) {
+            return hibit_friendly_size_sse42(str, len);
+        }
+    }
+    return calculate_string_size(str, len, hibit_friendly_chars);
 #else
     return calculate_string_size(str, len, hibit_friendly_chars);
 #endif
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
     return result;
 }
+#elif defined(HAVE_SIMD_SSE4_2)
+// Outcome of classifying one 16-byte chunk with the SSE4.2 lookup tables.
+typedef struct _sse42_match_result {
+    __m128i actions;                // looked-up value per byte, with '1' added back
+    bool    needs_escape;           // true when escape_mask is non-zero
+    int     escape_mask;            // one bit per byte whose looked-up value is non-zero
+    bool    has_some_hibit;         // a byte with bit 0x80 was seen (only computed when validating)
+    bool    do_unicode_validation;  // validation requested and a high-bit byte is present
+} sse42_match_result;
+// Loads 16 bytes from str, looks every byte up in cmap_sse42 and reports which
+// bytes have a non-zero table entry. High-bit detection is only performed when
+// both has_hi and do_unicode_validation are set.
+static inline OJ_TARGET_SSE42 sse42_match_result
+sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
+    const __m128i chunk  = _mm_loadu_si128((__m128i *)str);
+    const __m128i lookup = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
+    // Invert the "equals zero" compare: 0xFF in every lane with a non-zero entry.
+    const __m128i is_zero      = _mm_cmpeq_epi8(lookup, _mm_setzero_si128());
+    const __m128i escape_lanes = _mm_xor_si128(is_zero, _mm_set1_epi8(0xFF));
+    sse42_match_result result  = {.has_some_hibit = false, .do_unicode_validation = false};
+    result.actions      = _mm_add_epi8(lookup, _mm_set1_epi8('1'));
+    result.escape_mask  = _mm_movemask_epi8(escape_lanes);
+    result.needs_escape = (0 != result.escape_mask);
+    if (!(has_hi && do_unicode_validation)) {
+        return result;
+    }
+    const __m128i hibits         = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
+    result.has_some_hibit        = (0 != _mm_movemask_epi8(hibits));
+    result.do_unicode_validation = result.has_some_hibit;
+    return result;
+}
 #endif /* HAVE_SIMD_NEON */
 static inline FORCE_INLINE const char *process_character(char         action,
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
 #ifdef HAVE_SIMD_NEON
     uint8x16x4_t *cmap_neon       = NULL;
     int           neon_table_size = 0;
+#elif defined(HAVE_SIMD_SSE4_2)
+    __m128i *cmap_sse42 = NULL;
+    int      sse42_tab_size;
 #endif /* HAVE_SIMD_NEON */
     const char *orig                  = str;
     bool        has_hi                = false;
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
 #ifdef HAVE_SIMD_NEON
         cmap_neon       = hibit_friendly_chars_neon;
         neon_table_size = 2;
+#elif defined(HAVE_SIMD_SSE4_2)
+        cmap_sse42     = hibit_friendly_chars_sse42;
+        sse42_tab_size = 8;
 #endif /* HAVE_NEON_SIMD */
         size = hibit_friendly_size((uint8_t *)str, cnt);
     }
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         if (is_sym) {
             *out->cur++ = ':';
         }
-#ifdef HAVE_SIMD_NEON
-        const char *chunk_start;
-        const char *chunk_end;
-        const char *cursor   = str;
-        bool        use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
-        char        matches[16];
+#if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
 #define SEARCH_FLUSH                                  \
     if (str > cursor) {                               \
         APPEND_CHARS(out->cur, cursor, str - cursor); \
         cursor = str;                                 \
     }
-#endif /* HAVE_SIMD_NEON */
+        const char *chunk_start;
+        const char *chunk_end;
+        const char *cursor = str;
+        char        matches[16];
+#endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
+#if defined(HAVE_SIMD_NEON)
+        bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
+#elif defined(HAVE_SIMD_SSE4_2)
+        bool use_simd = false;
+        if (SIMD_Impl == SIMD_SSE42) {
+            use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
+        }
+#endif
 #ifdef HAVE_SIMD_NEON
-        if (use_neon) {
+        if (use_simd) {
             while (str < end) {
                 const char *chunk_ptr = NULL;
                 if (str + sizeof(uint8x16_t) <= end) {
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
             }
             SEARCH_FLUSH;
         }
-#endif /* HAVE_SIMD_NEON */
+#endif
+#ifdef HAVE_SIMD_SSE4_2
+        if (SIMD_Impl == SIMD_SSE42) {
+            if (use_simd) {
+                while (str < end) {
+                    const char *chunk_ptr = NULL;
+                    if (str + sizeof(__m128i) <= end) {
+                        chunk_ptr   = str;
+                        chunk_start = str;
+                        chunk_end   = str + sizeof(__m128i);
+                    } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
+                        memset(out->cur, 'A', sizeof(__m128i));
+                        memcpy(out->cur, str, (end - str));
+                        chunk_ptr   = out->cur;
+                        chunk_start = str;
+                        chunk_end   = end;
+                    } else {
+                        break;
+                    }
+                    sse42_match_result result = sse42_update(chunk_ptr,
+                                                             cmap_sse42,
+                                                             sse42_tab_size,
+                                                             do_unicode_validation,
+                                                             has_hi);
+                    if ((result.do_unicode_validation) || result.needs_escape) {
+                        SEARCH_FLUSH;
+                        _mm_storeu_si128((__m128i *)matches, result.actions);
+                        while (str < chunk_end) {
+                            long match_index = str - chunk_start;
+                            str              = process_character(matches[match_index],
+                                                    str,
+                                                    end,
+                                                    out,
+                                                    orig,
+                                                    do_unicode_validation,
+                                                    &check_start);
+                            str++;
+                        }
+                        cursor = str;
+                        continue;
+                    }
+                    str = chunk_end;
+                }
+                SEARCH_FLUSH;
+            }
+        }
+#endif /* HAVE_SIMD_SSE4_2 */
         for (; str < end; str++) {
             str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
         }

data/ext/oj/extconf.rb CHANGED Viewed

@@ -35,11 +35,15 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
 dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
-# Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
-# Falls back to SSE2 or compiler defaults if not available
-if try_cflags('-msse4.2')
-  $CPPFLAGS += ' -msse4.2'
-elsif try_cflags('-msse2')
+# SIMD optimizations use runtime CPU detection and function-level target attributes
+# We do NOT add global -msse4.2/-msse2 flags here because:
+# 1. It would cause illegal instruction errors on CPUs without SSE4.2
+# 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
+# 3. Runtime detection in oj_get_simd_implementation() selects the right path
+#
+# We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
+# and needed for compiling the SSE2 fallback code on 32-bit x86
+if try_cflags('-msse2')
   $CPPFLAGS += ' -msse2'
 end

data/ext/oj/oj.c CHANGED Viewed

@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
 VALUE oj_cache_mutex = Qnil;
 #endif
+SIMD_Implementation SIMD_Impl = SIMD_NONE;
 extern void oj_parser_init();
 const char oj_json_class[] = "json_class";
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
  *
  * - *:wab* specifically for WAB data exchange.
  */
+// =============================================================================
+// Runtime SIMD CPU detection
+// Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
+//
+// Returns the best SIMD implementation that is both reported by the running
+// CPU and compiled into this build. Every x86 path is guarded by the matching
+// HAVE_SIMD_* macro so a level is never reported when no code for it exists;
+// otherwise a dispatcher could fall through to the scalar path even though a
+// lower SIMD level is available.
+// =============================================================================
+SIMD_Implementation oj_get_simd_implementation(void) {
+#ifdef HAVE_SIMD_X86
+    // x86/x86_64 runtime detection
+#if defined(_MSC_VER)
+    // MSVC: Use __cpuid intrinsic
+    int cpu_info[4];
+    __cpuid(cpu_info, 1);
+#ifdef HAVE_SIMD_SSE4_2
+    // Check for SSE4.2 (bit 20 of ECX)
+    if (cpu_info[2] & (1 << 20)) {
+        return SIMD_SSE42;
+    }
+#endif
+#ifdef HAVE_SIMD_SSE2
+    // Check for SSE2 (bit 26 of EDX)
+    if (cpu_info[3] & (1 << 26)) {
+        return SIMD_SSE2;
+    }
+#endif
+#elif defined(__GNUC__) || defined(__clang__)
+    // GCC/Clang: Use __builtin_cpu_supports if available
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_cpu_supports)
+#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+#endif
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
+    // GCC 4.8+ has __builtin_cpu_supports
+#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
+#endif
+#ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
+#ifdef HAVE_SIMD_SSE4_2
+    if (__builtin_cpu_supports("sse4.2")) {
+        return SIMD_SSE42;
+    }
+#endif
+#ifdef HAVE_SIMD_SSE2
+    if (__builtin_cpu_supports("sse2")) {
+        return SIMD_SSE2;
+    }
+#endif
+#else
+    // Fallback: Use CPUID instruction directly
+    unsigned int eax, ebx, ecx, edx;
+    if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+#ifdef HAVE_SIMD_SSE4_2
+        // Check for SSE4.2 (bit 20 of ECX)
+        if (ecx & (1 << 20)) {
+            return SIMD_SSE42;
+        }
+#endif
+#ifdef HAVE_SIMD_SSE2
+        // Check for SSE2 (bit 26 of EDX)
+        if (edx & (1 << 26)) {
+            return SIMD_SSE2;
+        }
+#endif
+    }
+#endif  // OJ_HAS_BUILTIN_CPU_SUPPORTS
+#endif  // _MSC_VER vs GCC/Clang
+#endif  // HAVE_SIMD_X86
+#ifdef HAVE_SIMD_NEON
+    // ARM NEON is always available on ARM64 and detected at compile time
+    return SIMD_NEON;
+#endif
+    // No usable SIMD implementation: callers use the scalar code paths.
+    return SIMD_NONE;
+}
 void Init_oj(void) {
     int err = 0;
@@ -2080,10 +2154,18 @@ void Init_oj(void) {
 #endif
     oj_init_doc();
+    SIMD_Impl = oj_get_simd_implementation();
     oj_parser_init();
     oj_scanner_init();
 #ifdef HAVE_SIMD_NEON
     initialize_neon();
 #endif /* HAVE_SIMD_NEON */
+#ifdef HAVE_SIMD_SSE4_2
+    if (SIMD_Impl == SIMD_SSE42) {
+        initialize_sse42();
+    }
+#endif /* HAVE_SIMD_SSE4_2 */
 }

data/ext/oj/parse.c CHANGED Viewed

@@ -202,7 +202,8 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
 #ifdef HAVE_SIMD_SSE4_2
 // Optimized SIMD string scanner using SSE4.2 instructions
 // Uses prefetching and processes multiple chunks in parallel to reduce latency
-static inline const char *scan_string_SSE42(const char *str, const char *end) {
+// Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
+static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {
     static const char chars[16]   = "\x00\\\"";
     const __m128i     terminate   = _mm_loadu_si128((const __m128i *)&chars[0]);
     const char       *safe_end_64 = end - 64;
@@ -212,7 +213,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
     // This reduces pipeline stalls and improves instruction-level parallelism
     while (str <= safe_end_64) {
         // Prefetch next cache line for better memory throughput
-        __builtin_prefetch(str + 64, 0, 0);
+        OJ_PREFETCH(str + 64);
         // Load and compare 4 chunks in parallel
         const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -225,7 +226,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
                                     chunk0,
                                     16,
                                     _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (__builtin_expect(r0 != 16, 0))
+        if (OJ_UNLIKELY(r0 != 16))
             return str + r0;
         const int r1 = _mm_cmpestri(terminate,
@@ -233,7 +234,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
                                     chunk1,
                                     16,
                                     _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (__builtin_expect(r1 != 16, 0))
+        if (OJ_UNLIKELY(r1 != 16))
             return str + 16 + r1;
         const int r2 = _mm_cmpestri(terminate,
@@ -241,7 +242,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
                                     chunk2,
                                     16,
                                     _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (__builtin_expect(r2 != 16, 0))
+        if (OJ_UNLIKELY(r2 != 16))
             return str + 32 + r2;
         const int r3 = _mm_cmpestri(terminate,
@@ -249,7 +250,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
                                     chunk3,
                                     16,
                                     _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (__builtin_expect(r3 != 16, 0))
+        if (OJ_UNLIKELY(r3 != 16))
             return str + 48 + r3;
         str += 64;
@@ -274,7 +275,8 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
 #ifdef HAVE_SIMD_SSE2
 // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
 // Uses SSE2 instructions with prefetching and parallel processing
-static inline const char *scan_string_SSE2(const char *str, const char *end) {
+// Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
+static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
     const char *safe_end_64 = end - 64;
     const char *safe_end_16 = end - 16;
@@ -285,7 +287,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
     // Process 64 bytes at a time for better throughput
     while (str <= safe_end_64) {
-        __builtin_prefetch(str + 64, 0, 0);
+        OJ_PREFETCH(str + 64);
         // Load 4 chunks
         const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
@@ -309,20 +311,20 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
         // Convert to masks
         int mask0 = _mm_movemask_epi8(cmp0);
-        if (__builtin_expect(mask0 != 0, 0))
-            return str + __builtin_ctz(mask0);
+        if (OJ_UNLIKELY(mask0 != 0))
+            return str + OJ_CTZ(mask0);
         int mask1 = _mm_movemask_epi8(cmp1);
-        if (__builtin_expect(mask1 != 0, 0))
-            return str + 16 + __builtin_ctz(mask1);
+        if (OJ_UNLIKELY(mask1 != 0))
+            return str + 16 + OJ_CTZ(mask1);
         int mask2 = _mm_movemask_epi8(cmp2);
-        if (__builtin_expect(mask2 != 0, 0))
-            return str + 32 + __builtin_ctz(mask2);
+        if (OJ_UNLIKELY(mask2 != 0))
+            return str + 32 + OJ_CTZ(mask2);
         int mask3 = _mm_movemask_epi8(cmp3);
-        if (__builtin_expect(mask3 != 0, 0))
-            return str + 48 + __builtin_ctz(mask3);
+        if (OJ_UNLIKELY(mask3 != 0))
+            return str + 48 + OJ_CTZ(mask3);
         str += 64;
     }
@@ -335,7 +337,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
             _mm_cmpeq_epi8(chunk, quote));
         int mask = _mm_movemask_epi8(matches);
         if (mask != 0)
-            return str + __builtin_ctz(mask);
+            return str + OJ_CTZ(mask);
     }
     return scan_string_noSIMD(str, end);
@@ -345,11 +347,19 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
 static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
 void oj_scanner_init(void) {
+    // Use runtime CPU detection to select the best SIMD implementation
+    // This ensures we don't crash on CPUs that don't support SSE4.2
+    SIMD_Implementation impl = oj_get_simd_implementation();
+    switch (impl) {
 #ifdef HAVE_SIMD_SSE4_2
-    scan_func = scan_string_SSE42;
-#elif defined(HAVE_SIMD_SSE2)
-    scan_func = scan_string_SSE2;
+    case SIMD_SSE42: scan_func = scan_string_SSE42; break;
+#endif
+#ifdef HAVE_SIMD_SSE2
+    case SIMD_SSE2: scan_func = scan_string_SSE2; break;
 #endif
+    default: scan_func = scan_string_noSIMD; break;
+    }
     // Note: ARM NEON string scanning would be added here if needed
 }

data/ext/oj/rails.c CHANGED Viewed

@@ -668,7 +668,7 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
     } else {
         e->arg = rb_hash_new();
     }
-    oj_parse_options(*argv, &e->opts);
+    oj_parse_options(e->arg, &e->opts);
     return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
 }

data/ext/oj/simd.h CHANGED Viewed

@@ -3,45 +3,170 @@
 // SIMD architecture detection and configuration
 // This header provides unified SIMD support across different CPU architectures
+// with cross-platform runtime detection (Windows/Linux/Mac)
+// SIMD implementation enum - used for runtime selection
+typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
+// Defined in oj.c.
+extern SIMD_Implementation SIMD_Impl;
+// Runtime CPU detection function (implemented in oj.c)
+SIMD_Implementation oj_get_simd_implementation(void);
+// =============================================================================
+// Compiler compatibility macros
+// =============================================================================
+// Branch prediction hints
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
+#define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
+#else
+#define OJ_LIKELY(x) (x)
+#define OJ_UNLIKELY(x) (x)
+#endif
+// Prefetch hints
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
+#else
+#define OJ_PREFETCH(addr) ((void)0)
+#endif
+// Count trailing zeros (for SSE2 mask scanning)
+#if defined(__GNUC__) || defined(__clang__)
+#define OJ_CTZ(x) __builtin_ctz(x)
+#elif defined(_MSC_VER)
+#include <intrin.h>
+// Counts trailing zero bits of x using the MSVC _BitScanForward intrinsic.
+// _BitScanForward reports failure when no bit is set and then leaves the index
+// unspecified, so x == 0 is answered explicitly with 32, the same value the
+// portable fallback implementation returns.
+static __inline int oj_ctz_msvc(unsigned int x) {
+    unsigned long index = 0;
+    if (!_BitScanForward(&index, x)) {
+        return 32;
+    }
+    return (int)index;
+}
+#define OJ_CTZ(x) oj_ctz_msvc(x)
+#else
+// Fallback: naive implementation
+// Portable count of trailing zero bits: shifts x right until the lowest bit is
+// set, giving up after 32 positions (so x == 0 yields 32).
+static inline int oj_ctz_fallback(unsigned int x) {
+    int zeros;
+    for (zeros = 0; zeros < 32; zeros++, x >>= 1) {
+        if ((x & 1u) != 0) {
+            break;
+        }
+    }
+    return zeros;
+}
+#define OJ_CTZ(x) oj_ctz_fallback(x)
+#endif
+// =============================================================================
 // x86/x86_64 SIMD detection
+// =============================================================================
 #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
 #define HAVE_SIMD_X86 1
-// SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
-// Enabled automatically when compiler has -msse4.2 flag
-#if defined(__SSE4_2__)
+// Include appropriate SIMD headers
+#if defined(_MSC_VER)
+// MSVC: use intrin.h for all intrinsics
+#include <intrin.h>
 #define HAVE_SIMD_SSE4_2 1
-#include <nmmintrin.h>
+#define HAVE_SIMD_SSE2 1
+#elif defined(__GNUC__) || defined(__clang__)
+// GCC/Clang: check for header availability and include them
+// We include headers but use target attributes to enable instructions per-function
+// Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
+#if __has_include(<cpuid.h>)
+#include <cpuid.h>
 #endif
-// SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
-#if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
+#if defined(__SSE4_2__) || defined(__SSE2__)
+// If any SSE is enabled globally, x86intrin.h should be available
+#include <x86intrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#else
+// Try to include headers anyway for target attribute functions
+#if __has_include(<x86intrin.h>)
+#include <x86intrin.h>
+#define HAVE_SIMD_SSE4_2 1
+#define HAVE_SIMD_SSE2 1
+#elif __has_include(<nmmintrin.h>)
+#include <nmmintrin.h>
+#define HAVE_SIMD_SSE4_2 1
 #define HAVE_SIMD_SSE2 1
+#elif __has_include(<emmintrin.h>)
 #include <emmintrin.h>
+#define HAVE_SIMD_SSE2 1
+#endif
+#endif
+#endif
+// Target attribute macros for function-level SIMD enabling
+#if defined(__clang__) || defined(__GNUC__)
+#define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
+#define OJ_TARGET_SSE2 __attribute__((target("sse2")))
+#else
+// MSVC doesn't need target attributes - intrinsics are always available
+#define OJ_TARGET_SSE42
+#define OJ_TARGET_SSE2
 #endif
 #endif  // x86/x86_64
+// =============================================================================
 // ARM NEON detection
+// =============================================================================
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #define HAVE_SIMD_NEON 1
 #define SIMD_MINIMUM_THRESHOLD 6
 #include <arm_neon.h>
 #endif
-// Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
-#if defined(HAVE_SIMD_SSE4_2)
+// =============================================================================
+// SIMD type string for debugging/logging
+// =============================================================================
+#if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
 #define HAVE_SIMD_STRING_SCAN 1
-#define SIMD_TYPE "SSE4.2"
+#define SIMD_TYPE "x86 (runtime detected)"
 #elif defined(HAVE_SIMD_NEON)
 #define HAVE_SIMD_STRING_SCAN 1
 #define SIMD_TYPE "NEON"
-#elif defined(HAVE_SIMD_SSE2)
-#define HAVE_SIMD_STRING_SCAN 1
-#define SIMD_TYPE "SSE2"
 #else
 #define SIMD_TYPE "none"
 #endif
+#if defined(HAVE_SIMD_SSE4_2)
+#define SIMD_MINIMUM_THRESHOLD 6
+extern void initialize_sse42(void);
+// Byte-wise table lookup across up to 16 tables of 16 entries each.
+// For every byte of input, the high nibble selects lookup_table[hi] and the
+// low nibble selects the entry inside it. Bytes whose high nibble is not below
+// tab_size produce 0.
+//   input        - 16 bytes to translate
+//   lookup_table - array of at least tab_size 16-byte tables
+//   tab_size     - number of tables to consult (values above 16 are clamped)
+// Lookup and blend are done in one pass, so no temporary array is indexed by
+// the caller-supplied tab_size.
+static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
+    const __m128i nibble_mask = _mm_set1_epi8(0x0F);
+    // High 4 bits of each byte: which 16-byte table applies (0-15)
+    __m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), nibble_mask);
+    // Low 4 bits of each byte: index within the selected table (0-15)
+    __m128i low_index = _mm_and_si128(input, nibble_mask);
+    // A high nibble can only address tables 0-15; anything beyond is unreachable.
+    if (tab_size > 16) {
+        tab_size = 16;
+    }
+    __m128i final_result = _mm_setzero_si128();
+    for (int i = 0; i < tab_size; i++) {
+        // Look every byte up in table i, then keep only the lanes that selected it.
+        __m128i looked_up = _mm_shuffle_epi8(lookup_table[i], low_index);
+        __m128i mask      = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8((char)i));
+        final_result      = _mm_or_si128(final_result, _mm_and_si128(mask, looked_up));
+    }
+    return final_result;
+}
+#endif
 #endif /* OJ_SIMD_H */

data/lib/oj/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Oj
   # Current version of the module.
-  VERSION = '3.16.13'
+  VERSION = '3.16.14'
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: oj
 version: !ruby/object:Gem::Version
-  version: 3.16.13
+  version: 3.16.14
 platform: ruby
 authors:
 - Peter Ohler
@@ -23,20 +23,6 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '3.0'
-- !ruby/object:Gem::Dependency
-  name: ostruct
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0.2'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: '0.2'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
@@ -229,7 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.9
+rubygems_version: 4.0.3
 specification_version: 4
 summary: A fast JSON parser and serializer.
 test_files: []