RubyGems - oj - Versions diffs - 3.16.11 → 3.16.13 - Mend

oj 3.16.11 → 3.16.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 30aea721380a4e3edc306dd19906d8777f230a639ba3427e9394dd543a3a7e3b
-  data.tar.gz: b024a9d4513c16c1bfe4fc3c4adeacb1afd9a0e670987476d32cfa2fa74e9b1e
+  metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
+  data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
 SHA512:
-  metadata.gz: 527ea1162cb135bbe16eefc10a7cb05444182767aca6fa0b6986622e52d7082bcec020c43e663251406c81602018f7d0842c2c5cee37aeca0269560e502d99dd
-  data.tar.gz: e49e9f63e373cb0ec21f604f97899f87815b86ef5a5eafad30e7bddbd11e71156f92beaa1259c83609c2d45d2a8aac87c8b27e3e266fcc2bd99a1908327c796d
+  metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
+  data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,13 @@
 # CHANGELOG
+## 3.16.13 - 2025-12-05
+- Fixed rails encoding for Hash and Array subclasses.
+## 3.16.12 - 2025-10-29
+- Fixed dump realloc bug that occurred when using the compat mode dump options.
 ## 3.16.11 - 2025-05-29
 - Fixed range encoding with the :circular option

data/ext/oj/dump.c CHANGED Viewed

@@ -252,38 +252,46 @@ inline static size_t hixss_friendly_size(const uint8_t *str, size_t len) {
 }
 inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
-    long    size = 0;
-    uint8_t hi   = 0;
+    long     size = 0;
+    uint32_t hi   = 0;
 #ifdef HAVE_SIMD_NEON
     size_t i = 0;
-    uint8x16_t has_some_hibit = vdupq_n_u8(0);
-    uint8x16_t hibit          = vdupq_n_u8(0x80);
-    for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
-        size += sizeof(uint8x16_t);
+    if (len >= sizeof(uint8x16_t)) {
+        uint8x16_t has_some_hibit = vdupq_n_u8(0);
+        uint8x16_t hibit          = vdupq_n_u8(0x80);
-        uint8x16_t chunk = vld1q_u8(str);
+        for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
+            size += sizeof(uint8x16_t);
-        // Check to see if any of these bytes have the high bit set.
-        has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
+            uint8x16_t chunk = vld1q_u8(str);
-        uint8x16_t tmp1   = vqtbl4q_u8(rails_xss_friendly_chars_neon[0], chunk);
-        uint8x16_t tmp2   = vqtbl4q_u8(rails_xss_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
-        uint8x16_t tmp3   = vqtbl4q_u8(rails_xss_friendly_chars_neon[2], veorq_u8(chunk, vdupq_n_u8(0x80)));
-        uint8x16_t tmp4   = vqtbl4q_u8(rails_xss_friendly_chars_neon[3], veorq_u8(chunk, vdupq_n_u8(0xc0)));
-        uint8x16_t result = vorrq_u8(tmp4, vorrq_u8(tmp3, vorrq_u8(tmp1, tmp2)));
-        uint8_t    tmp    = vaddvq_u8(result);
-        size += tmp;
+            // Check to see if any of these bytes have the high bit set.
+            has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
+            uint8x16_t tmp1   = vqtbl4q_u8(rails_xss_friendly_chars_neon[0], chunk);
+            uint8x16_t tmp2   = vqtbl4q_u8(rails_xss_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
+            uint8x16_t tmp3   = vqtbl4q_u8(rails_xss_friendly_chars_neon[2], veorq_u8(chunk, vdupq_n_u8(0x80)));
+            uint8x16_t tmp4   = vqtbl4q_u8(rails_xss_friendly_chars_neon[3], veorq_u8(chunk, vdupq_n_u8(0xc0)));
+            uint8x16_t result = vorrq_u8(tmp4, vorrq_u8(tmp3, vorrq_u8(tmp1, tmp2)));
+            uint8_t    tmp    = vaddvq_u8(result);
+            size += tmp;
+        }
+        // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
+        hi = vmaxvq_u8(has_some_hibit) != 0;
     }
-    // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
-    hi = vmaxvq_u8(has_some_hibit) != 0;
+    size_t len_remaining = len - i;
     for (; i < len; str++, i++) {
-        size += rails_xss_friendly_chars[*str] - '0';
+        size += rails_xss_friendly_chars[*str];
         hi |= *str & 0x80;
     }
+    size -= (len_remaining * ((size_t)'0'));
     if (0 == hi) {
         return size;
     }
@@ -302,37 +310,43 @@ inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
 }
 inline static size_t rails_friendly_size(const uint8_t *str, size_t len) {
-    long    size = 0;
-    uint8_t hi   = 0;
+    long     size = 0;
+    uint32_t hi   = 0;
 #ifdef HAVE_SIMD_NEON
-    size_t i = 0;
+    size_t i     = 0;
+    long   extra = 0;
-    uint8x16_t has_some_hibit = vdupq_n_u8(0);
-    uint8x16_t hibit          = vdupq_n_u8(0x80);
+    if (len >= sizeof(uint8x16_t)) {
+        uint8x16_t has_some_hibit = vdupq_n_u8(0);
+        uint8x16_t hibit          = vdupq_n_u8(0x80);
-    for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
-        size += sizeof(uint8x16_t);
+        for (; i + sizeof(uint8x16_t) <= len; i += sizeof(uint8x16_t), str += sizeof(uint8x16_t)) {
+            size += sizeof(uint8x16_t);
-        // See https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
-        uint8x16_t chunk = vld1q_u8(str);
+            // See https://lemire.me/blog/2019/07/23/arbitrary-byte-to-byte-maps-using-arm-neon/
+            uint8x16_t chunk = vld1q_u8(str);
-        // Check to see if any of these bytes have the high bit set.
-        has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
+            // Check to see if any of these bytes have the high bit set.
+            has_some_hibit = vorrq_u8(has_some_hibit, vandq_u8(chunk, hibit));
-        uint8x16_t tmp1   = vqtbl4q_u8(rails_friendly_chars_neon[0], chunk);
-        uint8x16_t tmp2   = vqtbl4q_u8(rails_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
-        uint8x16_t result = vorrq_u8(tmp1, tmp2);
-        uint8_t    tmp    = vaddvq_u8(result);
-        size += tmp;
-    }
+            uint8x16_t tmp1   = vqtbl4q_u8(rails_friendly_chars_neon[0], chunk);
+            uint8x16_t tmp2   = vqtbl4q_u8(rails_friendly_chars_neon[1], veorq_u8(chunk, vdupq_n_u8(0x40)));
+            uint8x16_t result = vorrq_u8(tmp1, tmp2);
+            uint8_t    tmp    = vaddvq_u8(result);
+            size += tmp;
+        }
-    // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
-    hi = vmaxvq_u8(has_some_hibit) != 0;
+        // 'hi' should be set if any of the bytes we processed have the high bit set. It doesn't matter which ones.
+        hi = vmaxvq_u8(has_some_hibit) != 0;
+    }
-    for (; i < len; str++, i++) {
-        size += rails_friendly_chars[*str] - '0';
+    for (; i < len; str++, i++, extra++) {
+        size += rails_friendly_chars[*str];
         hi |= *str & 0x80;
     }
+    size -= (extra * ((size_t)'0'));
     if (0 == hi) {
         return size;
     }
@@ -896,6 +910,12 @@ void oj_dump_raw_json(VALUE obj, int depth, Out out) {
     }
 }
+#if defined(__clang__) || defined(__GNUC__)
+#define FORCE_INLINE __attribute__((always_inline))
+#else
+#define FORCE_INLINE
+#endif
 #ifdef HAVE_SIMD_NEON
 typedef struct _neon_match_result {
     uint8x16_t needs_escape;
@@ -903,12 +923,6 @@ typedef struct _neon_match_result {
     bool       do_unicode_validation;
 } neon_match_result;
-#if defined(__clang__) || defined(__GNUC__)
-#define FORCE_INLINE __attribute__((always_inline))
-#else
-#define FORCE_INLINE
-#endif
 static inline FORCE_INLINE neon_match_result
 neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool do_unicode_validation, bool has_hi) {
     neon_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
@@ -932,12 +946,83 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
 #endif /* HAVE_SIMD_NEON */
+static inline FORCE_INLINE const char *process_character(char         action,
+                                                         const char  *str,
+                                                         const char  *end,
+                                                         Out          out,
+                                                         const char  *orig,
+                                                         bool         do_unicode_validation,
+                                                         const char **check_start_) {
+    const char *check_start = *check_start_;
+    switch (action) {
+    case '1':
+        if (do_unicode_validation && check_start <= str) {
+            if (0 != (0x80 & (uint8_t)*str)) {
+                if (0xC0 == (0xC0 & (uint8_t)*str)) {
+                    *check_start_ = check_unicode(str, end, orig);
+                } else {
+                    raise_invalid_unicode(orig, (int)(end - orig), (int)(str - orig));
+                }
+            }
+        }
+        *out->cur++ = *str;
+        break;
+    case '2':
+        *out->cur++ = '\\';
+        switch (*str) {
+        case '\\': *out->cur++ = '\\'; break;
+        case '\b': *out->cur++ = 'b'; break;
+        case '\t': *out->cur++ = 't'; break;
+        case '\n': *out->cur++ = 'n'; break;
+        case '\f': *out->cur++ = 'f'; break;
+        case '\r': *out->cur++ = 'r'; break;
+        default: *out->cur++ = *str; break;
+        }
+        break;
+    case '3':  // Unicode
+        if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
+            if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
+                str = dump_unicode(str, end, out, orig);
+            } else {
+                *check_start_ = check_unicode(str, end, orig);
+                *out->cur++   = *str;
+            }
+            break;
+        }
+        str = dump_unicode(str, end, out, orig);
+        break;
+    case '6':  // control characters
+        if (*(uint8_t *)str < 0x80) {
+            if (0 == (uint8_t)*str && out->opts->dump_opts.omit_null_byte) {
+                break;
+            }
+            APPEND_CHARS(out->cur, "\\u00", 4);
+            dump_hex((uint8_t)*str, out);
+        } else {
+            if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
+                if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
+                    str = dump_unicode(str, end, out, orig);
+                } else {
+                    *check_start_ = check_unicode(str, end, orig);
+                    *out->cur++   = *str;
+                }
+                break;
+            }
+            str = dump_unicode(str, end, out, orig);
+        }
+        break;
+    default: break;  // ignore, should never happen if the table is correct
+    }
+    return str;
+}
 void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out out) {
     size_t size;
     char  *cmap;
 #ifdef HAVE_SIMD_NEON
-    uint8x16x4_t *cmap_neon = NULL;
-    int           neon_table_size;
+    uint8x16x4_t *cmap_neon       = NULL;
+    int           neon_table_size = 0;
 #endif /* HAVE_SIMD_NEON */
     const char *orig                  = str;
     bool        has_hi                = false;
@@ -1036,171 +1121,83 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
 #ifdef HAVE_SIMD_NEON
         const char *chunk_start;
         const char *chunk_end;
-        const char *cursor     = str;
-        int         neon_state = (cmap_neon != NULL) ? 1 : 4;
+        const char *cursor   = str;
+        bool        use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
         char        matches[16];
-        bool        do_hi_validation = false;
-        // uint64_t neon_match_mask = 0;
 #define SEARCH_FLUSH                                  \
     if (str > cursor) {                               \
         APPEND_CHARS(out->cur, cursor, str - cursor); \
         cursor = str;                                 \
     }
-    loop:
 #endif /* HAVE_SIMD_NEON */
-        for (; str < end; str++) {
-            char action = 0;
 #ifdef HAVE_SIMD_NEON
-            /* neon_state:
-             * 1: Scanning for matches. There must be at least
-                  sizeof(uint8x16_t) bytes of input data to use SIMD and
-                  cmap_neon must be non-null.
-             * 2: Matches have been found. Will set str to the position of the
-             *    next match and set the state to 3.
-             *    If there are no more matches it will transition to state 1.
-             * 4: Fallback to the scalar algorithm. Not enough data to use
-             *    SIMD.
-             */
-#define NEON_SET_STATE(state) \
-    neon_state = state;       \
-    goto loop;
-#define NEON_RETURN_TO_STATE(state) neon_state = state;
-            switch (neon_state) {
-            case 1: {
-                while (true) {
-                    const char *chunk_ptr = NULL;
-                    if (str + sizeof(uint8x16_t) <= end) {
-                        chunk_ptr   = str;
-                        chunk_start = str;
-                        chunk_end   = str + sizeof(uint8x16_t);
-                    } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
-                        memset(out->cur, 'A', sizeof(uint8x16_t));
-                        memcpy(out->cur, str, (end - str));
-                        chunk_ptr   = out->cur;
-                        chunk_start = str;
-                        chunk_end   = end;
-                    } else {
-                        SEARCH_FLUSH;
-                        NEON_SET_STATE(4);
-                        break; /* Unreachable */
-                    }
-                    neon_match_result result = neon_update(chunk_ptr,
-                                                           cmap_neon,
-                                                           neon_table_size,
-                                                           do_unicode_validation,
-                                                           has_hi);
-                    if ((result.do_unicode_validation) || vmaxvq_u8(result.needs_escape) != 0) {
-                        SEARCH_FLUSH;
-                        uint8x16_t actions = vaddq_u8(result.needs_escape, vdupq_n_u8('1'));
-                        do_hi_validation   = result.do_unicode_validation;
-                        vst1q_u8((unsigned char *)matches, actions);
-                        NEON_SET_STATE(2);
-                        break; /* Unreachable */
-                    }
-                    str = chunk_end;
-                }
-                // We must have run out of data to use SIMD. Go to state 4.
-                SEARCH_FLUSH;
-                NEON_SET_STATE(4);
-            } break;
-            case 3:
-                cursor = str;
-                // This fall through is intentional. We return to state 3 after we process
-                // a byte (or multiple). We return to this state to ensure the cursor is
-                // pointing to the correct location. We then resume looking for matches
-                // within the previously processed chunk.
-            case 2:
-                if (str >= chunk_end) {
-                    NEON_SET_STATE(1);
-                }
-                if (!do_hi_validation) {
-                    long i = str - chunk_start;
-                    for (; str < chunk_end; i++) {
-                        if ((action = matches[i]) != '1') {
-                            break;
-                        }
-                        *out->cur++ = *str++;
-                    }
-                    // The loop above may have advanced str and directly output them to out->cur.
-                    // Ensure cursor is set appropriately.
-                    cursor = str;
-                    if (str >= chunk_end) {
-                        // We must have advanced past the end... we are done.
-                        NEON_SET_STATE(1);
-                    }
+        if (use_neon) {
+            while (str < end) {
+                const char *chunk_ptr = NULL;
+                if (str + sizeof(uint8x16_t) <= end) {
+                    chunk_ptr   = str;
+                    chunk_start = str;
+                    chunk_end   = str + sizeof(uint8x16_t);
+                } else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
+                    memset(out->cur, 'A', sizeof(uint8x16_t));
+                    memcpy(out->cur, str, (end - str));
+                    chunk_ptr   = out->cur;
+                    chunk_start = str;
+                    chunk_end   = end;
                 } else {
-                    long match_index = str - chunk_start;
-                    action           = matches[match_index];
+                    break;
                 }
-                NEON_RETURN_TO_STATE(3);
-                break;
-            case 4: action = cmap[(uint8_t)*str];
-            }
-#undef NEON_SET_STATE
-#undef NEON_RETURN_TO_STATE
-#else
-            action = cmap[(uint8_t)*str];
-#endif /* HAVE_SIMD_NEON */
-            switch (action) {
-            case '1':
-                if (do_unicode_validation && check_start <= str) {
-                    if (0 != (0x80 & (uint8_t)*str)) {
-                        if (0xC0 == (0xC0 & (uint8_t)*str)) {
-                            check_start = check_unicode(str, end, orig);
-                        } else {
-                            raise_invalid_unicode(orig, (int)(end - orig), (int)(str - orig));
+                neon_match_result result = neon_update(chunk_ptr,
+                                                       cmap_neon,
+                                                       neon_table_size,
+                                                       do_unicode_validation,
+                                                       has_hi);
+                if ((result.do_unicode_validation) || vmaxvq_u8(result.needs_escape) != 0) {
+                    SEARCH_FLUSH;
+                    uint8x16_t actions     = vaddq_u8(result.needs_escape, vdupq_n_u8('1'));
+                    uint8_t    num_matches = vaddvq_u8(vandq_u8(result.needs_escape, vdupq_n_u8(0x1)));
+                    vst1q_u8((unsigned char *)matches, actions);
+                    bool process_each = result.do_unicode_validation || (num_matches > sizeof(uint8x16_t) / 2);
+                    // If no byte in this chunk had the high bit set then we can skip
+                    // all of the '1' bytes by directly copying them to the output.
+                    if (!process_each) {
+                        while (str < chunk_end) {
+                            long i = str - chunk_start;
+                            char action;
+                            while (str < chunk_end && (action = matches[i++]) == '1') {
+                                *out->cur++ = *str++;
+                            }
+                            cursor = str;
+                            if (str >= chunk_end) {
+                                break;
+                            }
+                            str = process_character(action, str, end, out, orig, do_unicode_validation, &check_start);
+                            str++;
                         }
-                    }
-                }
-                *out->cur++ = *str;
-                break;
-            case '2':
-                *out->cur++ = '\\';
-                switch (*str) {
-                case '\\': *out->cur++ = '\\'; break;
-                case '\b': *out->cur++ = 'b'; break;
-                case '\t': *out->cur++ = 't'; break;
-                case '\n': *out->cur++ = 'n'; break;
-                case '\f': *out->cur++ = 'f'; break;
-                case '\r': *out->cur++ = 'r'; break;
-                default: *out->cur++ = *str; break;
-                }
-                break;
-            case '3':  // Unicode
-                if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
-                    if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
-                        str = dump_unicode(str, end, out, orig);
                     } else {
-                        check_start = check_unicode(str, end, orig);
-                        *out->cur++ = *str;
-                    }
-                    break;
-                }
-                str = dump_unicode(str, end, out, orig);
-                break;
-            case '6':  // control characters
-                if (*(uint8_t *)str < 0x80) {
-                    if (0 == (uint8_t)*str && out->opts->dump_opts.omit_null_byte) {
-                        break;
-                    }
-                    APPEND_CHARS(out->cur, "\\u00", 4);
-                    dump_hex((uint8_t)*str, out);
-                } else {
-                    if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
-                        if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
-                            str = dump_unicode(str, end, out, orig);
-                        } else {
-                            check_start = check_unicode(str, end, orig);
-                            *out->cur++ = *str;
+                        while (str < chunk_end) {
+                            long match_index = str - chunk_start;
+                            str              = process_character(matches[match_index],
+                                                    str,
+                                                    end,
+                                                    out,
+                                                    orig,
+                                                    do_unicode_validation,
+                                                    &check_start);
+                            str++;
                         }
-                        break;
                     }
-                    str = dump_unicode(str, end, out, orig);
+                    cursor = str;
+                    continue;
                 }
-                break;
-            default: break;  // ignore, should never happen if the table is correct
+                str = chunk_end;
             }
+            SEARCH_FLUSH;
+        }
+#endif /* HAVE_SIMD_NEON */
+        for (; str < end; str++) {
+            str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
         }
         *out->cur++ = '"';
     }

data/ext/oj/dump_compat.c CHANGED Viewed

@@ -148,10 +148,10 @@ static void dump_array(VALUE a, int depth, Out out, bool as_ok) {
         } else {
             size = d2 * out->indent + 2;
         }
-        assure_size(out, size * cnt);
         cnt--;
         for (i = 0; i <= cnt; i++) {
             if (out->opts->dump_opts.use) {
+                assure_size(out, size);
                 if (0 < out->opts->dump_opts.array_size) {
                     APPEND_CHARS(out->cur, out->opts->dump_opts.array_nl, out->opts->dump_opts.array_size);
                 }

data/ext/oj/extconf.rb CHANGED Viewed

@@ -35,13 +35,12 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
 dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
-if with_config('--with-sse42')
-  if try_cflags('-msse4.2')
-    $CPPFLAGS += ' -msse4.2'
-    dflags['OJ_USE_SSE4_2'] = 1
-  else
-    warn 'SSE 4.2 is not supported on this platform.'
-  end
+# Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
+# Falls back to SSE2 or compiler defaults if not available
+if try_cflags('-msse4.2')
+  $CPPFLAGS += ' -msse4.2'
+elsif try_cflags('-msse2')
+  $CPPFLAGS += ' -msse2'
 end
 if enable_config('trace-log', false)

data/ext/oj/parse.c CHANGED Viewed

@@ -15,12 +15,9 @@
 #include "mem.h"
 #include "oj.h"
 #include "rxclass.h"
+#include "simd.h"
 #include "val_stack.h"
-#ifdef OJ_USE_SSE4_2
-#include <nmmintrin.h>
-#endif
 // Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
 #define OJ_INFINITY (1.0 / 0.0)
@@ -202,23 +199,143 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
     return str;
 }
-#ifdef OJ_USE_SSE4_2
-static inline const char *scan_string_SIMD(const char *str, const char *end) {
-    static const char chars[16] = "\x00\\\"";
-    const __m128i     terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
-    const char       *_end      = (const char *)(end - 16);
+#ifdef HAVE_SIMD_SSE4_2
+// Optimized SIMD string scanner using SSE4.2 instructions
+// Uses prefetching and processes multiple chunks in parallel to reduce latency
+static inline const char *scan_string_SSE42(const char *str, const char *end) {
+    static const char chars[16]   = "\x00\\\"";
+    const __m128i     terminate   = _mm_loadu_si128((const __m128i *)&chars[0]);
+    const char       *safe_end_64 = end - 64;
+    const char       *safe_end_16 = end - 16;
+    // Process 64 bytes at a time with parallel SIMD operations
+    // This reduces pipeline stalls and improves instruction-level parallelism
+    while (str <= safe_end_64) {
+        // Prefetch next cache line for better memory throughput
+        __builtin_prefetch(str + 64, 0, 0);
+        // Load and compare 4 chunks in parallel
+        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
+        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
+        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
+        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
+        const int r0 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk0,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (__builtin_expect(r0 != 16, 0))
+            return str + r0;
+        const int r1 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk1,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (__builtin_expect(r1 != 16, 0))
+            return str + 16 + r1;
+        const int r2 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk2,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (__builtin_expect(r2 != 16, 0))
+            return str + 32 + r2;
+        const int r3 = _mm_cmpestri(terminate,
+                                    3,
+                                    chunk3,
+                                    16,
+                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
+        if (__builtin_expect(r3 != 16, 0))
+            return str + 48 + r3;
+        str += 64;
+    }
-    for (; str <= _end; str += 16) {
+    // Handle remaining 16-byte chunks
+    for (; str <= safe_end_16; str += 16) {
         const __m128i string = _mm_loadu_si128((const __m128i *)str);
         const int     r      = _mm_cmpestri(terminate,
                                    3,
                                    string,
                                    16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
-        if (r != 16) {
-            str = (char *)(str + r);
-            return str;
-        }
+        if (r != 16)
+            return str + r;
+    }
+    return scan_string_noSIMD(str, end);
+}
+#endif
+#ifdef HAVE_SIMD_SSE2
+// Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
+// Uses SSE2 instructions with prefetching and parallel processing
+static inline const char *scan_string_SSE2(const char *str, const char *end) {
+    const char *safe_end_64 = end - 64;
+    const char *safe_end_16 = end - 16;
+    // Create comparison vectors for our three special characters
+    const __m128i null_char = _mm_setzero_si128();
+    const __m128i backslash = _mm_set1_epi8('\\');
+    const __m128i quote     = _mm_set1_epi8('"');
+    // Process 64 bytes at a time for better throughput
+    while (str <= safe_end_64) {
+        __builtin_prefetch(str + 64, 0, 0);
+        // Load 4 chunks
+        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
+        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
+        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
+        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
+        // Compare all chunks (allows CPU to parallelize)
+        const __m128i cmp0 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char), _mm_cmpeq_epi8(chunk0, backslash)),
+            _mm_cmpeq_epi8(chunk0, quote));
+        const __m128i cmp1 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char), _mm_cmpeq_epi8(chunk1, backslash)),
+            _mm_cmpeq_epi8(chunk1, quote));
+        const __m128i cmp2 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char), _mm_cmpeq_epi8(chunk2, backslash)),
+            _mm_cmpeq_epi8(chunk2, quote));
+        const __m128i cmp3 = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char), _mm_cmpeq_epi8(chunk3, backslash)),
+            _mm_cmpeq_epi8(chunk3, quote));
+        // Convert to masks
+        int mask0 = _mm_movemask_epi8(cmp0);
+        if (__builtin_expect(mask0 != 0, 0))
+            return str + __builtin_ctz(mask0);
+        int mask1 = _mm_movemask_epi8(cmp1);
+        if (__builtin_expect(mask1 != 0, 0))
+            return str + 16 + __builtin_ctz(mask1);
+        int mask2 = _mm_movemask_epi8(cmp2);
+        if (__builtin_expect(mask2 != 0, 0))
+            return str + 32 + __builtin_ctz(mask2);
+        int mask3 = _mm_movemask_epi8(cmp3);
+        if (__builtin_expect(mask3 != 0, 0))
+            return str + 48 + __builtin_ctz(mask3);
+        str += 64;
+    }
+    // Handle remaining 16-byte chunks
+    for (; str <= safe_end_16; str += 16) {
+        const __m128i chunk   = _mm_loadu_si128((const __m128i *)str);
+        const __m128i matches = _mm_or_si128(
+            _mm_or_si128(_mm_cmpeq_epi8(chunk, null_char), _mm_cmpeq_epi8(chunk, backslash)),
+            _mm_cmpeq_epi8(chunk, quote));
+        int mask = _mm_movemask_epi8(matches);
+        if (mask != 0)
+            return str + __builtin_ctz(mask);
     }
     return scan_string_noSIMD(str, end);
@@ -228,9 +345,12 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
 static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
 void oj_scanner_init(void) {
-#ifdef OJ_USE_SSE4_2
-    scan_func = scan_string_SIMD;
+#ifdef HAVE_SIMD_SSE4_2
+    scan_func = scan_string_SSE42;
+#elif defined(HAVE_SIMD_SSE2)
+    scan_func = scan_string_SSE2;
 #endif
+    // Note: ARM NEON string scanning would be added here if needed
 }
 // entered at /

data/ext/oj/rails.c CHANGED Viewed

@@ -661,13 +661,15 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
     Encoder e = OJ_R_ALLOC(struct _encoder);
     e->opts = oj_default_options;
-    e->arg  = Qnil;
     copy_opts(&ropts, &e->ropts);
     if (1 <= argc && Qnil != *argv) {
-        oj_parse_options(*argv, &e->opts);
         e->arg = *argv;
+    } else {
+        e->arg = rb_hash_new();
     }
+    oj_parse_options(*argv, &e->opts);
     return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
 }

data/ext/oj/simd.h CHANGED Viewed

@@ -1,10 +1,47 @@
 #ifndef OJ_SIMD_H
 #define OJ_SIMD_H
+// SIMD architecture detection and configuration
+// This header provides unified SIMD support across different CPU architectures
+// x86/x86_64 SIMD detection
+#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
+#define HAVE_SIMD_X86 1
+// SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
+// Enabled automatically when compiler has -msse4.2 flag
+#if defined(__SSE4_2__)
+#define HAVE_SIMD_SSE4_2 1
+#include <nmmintrin.h>
+#endif
+// SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
+#if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
+#define HAVE_SIMD_SSE2 1
+#include <emmintrin.h>
+#endif
+#endif  // x86/x86_64
+// ARM NEON detection
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #define HAVE_SIMD_NEON 1
 #define SIMD_MINIMUM_THRESHOLD 6
 #include <arm_neon.h>
 #endif
-#endif /* OJ_SIMD_H */
+// Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
+#if defined(HAVE_SIMD_SSE4_2)
+#define HAVE_SIMD_STRING_SCAN 1
+#define SIMD_TYPE "SSE4.2"
+#elif defined(HAVE_SIMD_NEON)
+#define HAVE_SIMD_STRING_SCAN 1
+#define SIMD_TYPE "NEON"
+#elif defined(HAVE_SIMD_SSE2)
+#define HAVE_SIMD_STRING_SCAN 1
+#define SIMD_TYPE "SSE2"
+#else
+#define SIMD_TYPE "none"
+#endif
+#endif /* OJ_SIMD_H */

data/lib/oj/version.rb CHANGED Viewed

@@ -1,4 +1,4 @@
 module Oj
   # Current version of the module.
-  VERSION = '3.16.11'
+  VERSION = '3.16.13'
 end

metadata CHANGED Viewed

@@ -1,13 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: oj
 version: !ruby/object:Gem::Version
-  version: 3.16.11
+  version: 3.16.13
 platform: ruby
 authors:
 - Peter Ohler
 bindir: bin
 cert_chain: []
-date: 2025-05-30 00:00:00.000000000 Z
+date: 1980-01-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bigdecimal
@@ -91,9 +91,9 @@ executables: []
 extensions:
 - ext/oj/extconf.rb
 extra_rdoc_files:
-- README.md
-- LICENSE
 - CHANGELOG.md
+- LICENSE
+- README.md
 - RELEASE_NOTES.md
 - pages/Advanced.md
 - pages/Compatibility.md
@@ -229,7 +229,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.2
+rubygems_version: 3.6.9
 specification_version: 4
 summary: A fast JSON parser and serializer.
 test_files: []