oj 3.16.12 → 3.16.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6e2053531cd4c7c7b49bf16ddafbfda868e67e66c1bfbec1b06daaa0ba3f1c45
4
- data.tar.gz: ce029b2f90660922dd8fb335c23463a1fe1d81317d56e48f2cfddc9271de2d15
3
+ metadata.gz: fc0290fa1cfe6af1094de1d7188836e0c09cb04f2f08401de118253026604650
4
+ data.tar.gz: de5258e96984a21afb2fac946fe28ad255926893a6157f2874445edf10aa8bbe
5
5
  SHA512:
6
- metadata.gz: deb7f1447b5022adad6d7387b8a8bfd866d399abc2d9e434f7e6d321fa73cb1738ff9aa7ee22ac064c455d5d3951ba7469a30720c9474af5e96c70eaa5b5303a
7
- data.tar.gz: b9d28d76c714947c1e6b133225e348838ad13f1c8b7dc82f0fee261272259cd0e83b911f2f2762c55a719f93e669275ad32d393f371c264a4e587e49b1c3a84b
6
+ metadata.gz: d7870818fd86043a17b834756b67a4009a6f7ef60baf53b02a0b0d4431ccba723d9e533553bea04ae46ae9f233e447b79c4106e6827782bd0c2ffb9c332081a3
7
+ data.tar.gz: fd3966ac7fb5da9f1a5ebb68f4a8f5b9a5f9fa1a1255e93dfef078f66f00a6af5bb7e37676441f7d6229b29222741a2bc7b75164fd445a39b906ef904946d41b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## 3.16.13 - 2025-12-05
4
+
5
+ - Fixed rails encoding for Hash and Array subclasses.
6
+
3
7
  ## 3.16.12 - 2025-10-29
4
8
 
5
9
  - Fixed dump realloc bug that occurred when using the compat mode dump options.
data/ext/oj/extconf.rb CHANGED
@@ -35,13 +35,12 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
35
35
 
36
36
  dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
37
37
 
38
- if with_config('--with-sse42')
39
- if try_cflags('-msse4.2')
40
- $CPPFLAGS += ' -msse4.2'
41
- dflags['OJ_USE_SSE4_2'] = 1
42
- else
43
- warn 'SSE 4.2 is not supported on this platform.'
44
- end
38
+ # Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance
39
+ # Falls back to -msse2, or to the compiler defaults, if the compiler rejects the flag
40
+ if try_cflags('-msse4.2')
41
+ $CPPFLAGS += ' -msse4.2'
42
+ elsif try_cflags('-msse2')
43
+ $CPPFLAGS += ' -msse2'
45
44
  end
46
45
 
47
46
  if enable_config('trace-log', false)
data/ext/oj/parse.c CHANGED
@@ -15,12 +15,9 @@
15
15
  #include "mem.h"
16
16
  #include "oj.h"
17
17
  #include "rxclass.h"
18
+ #include "simd.h"
18
19
  #include "val_stack.h"
19
20
 
20
- #ifdef OJ_USE_SSE4_2
21
- #include <nmmintrin.h>
22
- #endif
23
-
24
21
  // Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
25
22
  #define OJ_INFINITY (1.0 / 0.0)
26
23
 
@@ -202,23 +199,143 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
202
199
  return str;
203
200
  }
204
201
 
205
- #ifdef OJ_USE_SSE4_2
206
- static inline const char *scan_string_SIMD(const char *str, const char *end) {
207
- static const char chars[16] = "\x00\\\"";
208
- const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
209
- const char *_end = (const char *)(end - 16);
202
+ #ifdef HAVE_SIMD_SSE4_2
203
+ // Optimized SIMD string scanner using SSE4.2 instructions
204
+ // Prefetches the next 64 bytes and scans four 16-byte chunks per iteration to reduce latency
205
+ static inline const char *scan_string_SSE42(const char *str, const char *end) {
206
+ static const char chars[16] = "\x00\\\"";
207
+ const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
208
+ const char *safe_end_64 = end - 64;
209
+ const char *safe_end_16 = end - 16;
210
+
211
+ // Process 64 bytes at a time with parallel SIMD operations
212
+ // This reduces pipeline stalls and improves instruction-level parallelism
213
+ while (str <= safe_end_64) {
214
+ // Prefetch next cache line for better memory throughput
215
+ __builtin_prefetch(str + 64, 0, 0);
216
+
217
+ // Load and compare 4 chunks in parallel
218
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
219
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
220
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
221
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
222
+
223
+ const int r0 = _mm_cmpestri(terminate,
224
+ 3,
225
+ chunk0,
226
+ 16,
227
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
228
+ if (__builtin_expect(r0 != 16, 0))
229
+ return str + r0;
230
+
231
+ const int r1 = _mm_cmpestri(terminate,
232
+ 3,
233
+ chunk1,
234
+ 16,
235
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
236
+ if (__builtin_expect(r1 != 16, 0))
237
+ return str + 16 + r1;
238
+
239
+ const int r2 = _mm_cmpestri(terminate,
240
+ 3,
241
+ chunk2,
242
+ 16,
243
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
244
+ if (__builtin_expect(r2 != 16, 0))
245
+ return str + 32 + r2;
246
+
247
+ const int r3 = _mm_cmpestri(terminate,
248
+ 3,
249
+ chunk3,
250
+ 16,
251
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
252
+ if (__builtin_expect(r3 != 16, 0))
253
+ return str + 48 + r3;
254
+
255
+ str += 64;
256
+ }
210
257
 
211
- for (; str <= _end; str += 16) {
258
+ // Handle remaining 16-byte chunks
259
+ for (; str <= safe_end_16; str += 16) {
212
260
  const __m128i string = _mm_loadu_si128((const __m128i *)str);
213
261
  const int r = _mm_cmpestri(terminate,
214
262
  3,
215
263
  string,
216
264
  16,
217
265
  _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
218
- if (r != 16) {
219
- str = (char *)(str + r);
220
- return str;
221
- }
266
+ if (r != 16)
267
+ return str + r;
268
+ }
269
+
270
+ return scan_string_noSIMD(str, end);
271
+ }
272
+ #endif
273
+
274
+ #ifdef HAVE_SIMD_SSE2
275
+ // Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
276
+ // Uses SSE2 instructions with prefetching and parallel processing
277
+ static inline const char *scan_string_SSE2(const char *str, const char *end) {
278
+ const char *safe_end_64 = end - 64;
279
+ const char *safe_end_16 = end - 16;
280
+
281
+ // Create comparison vectors for our three special characters
282
+ const __m128i null_char = _mm_setzero_si128();
283
+ const __m128i backslash = _mm_set1_epi8('\\');
284
+ const __m128i quote = _mm_set1_epi8('"');
285
+
286
+ // Process 64 bytes at a time for better throughput
287
+ while (str <= safe_end_64) {
288
+ __builtin_prefetch(str + 64, 0, 0);
289
+
290
+ // Load 4 chunks
291
+ const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
292
+ const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
293
+ const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
294
+ const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));
295
+
296
+ // Compare all chunks (allows CPU to parallelize)
297
+ const __m128i cmp0 = _mm_or_si128(
298
+ _mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char), _mm_cmpeq_epi8(chunk0, backslash)),
299
+ _mm_cmpeq_epi8(chunk0, quote));
300
+ const __m128i cmp1 = _mm_or_si128(
301
+ _mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char), _mm_cmpeq_epi8(chunk1, backslash)),
302
+ _mm_cmpeq_epi8(chunk1, quote));
303
+ const __m128i cmp2 = _mm_or_si128(
304
+ _mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char), _mm_cmpeq_epi8(chunk2, backslash)),
305
+ _mm_cmpeq_epi8(chunk2, quote));
306
+ const __m128i cmp3 = _mm_or_si128(
307
+ _mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char), _mm_cmpeq_epi8(chunk3, backslash)),
308
+ _mm_cmpeq_epi8(chunk3, quote));
309
+
310
+ // Convert to masks
311
+ int mask0 = _mm_movemask_epi8(cmp0);
312
+ if (__builtin_expect(mask0 != 0, 0))
313
+ return str + __builtin_ctz(mask0);
314
+
315
+ int mask1 = _mm_movemask_epi8(cmp1);
316
+ if (__builtin_expect(mask1 != 0, 0))
317
+ return str + 16 + __builtin_ctz(mask1);
318
+
319
+ int mask2 = _mm_movemask_epi8(cmp2);
320
+ if (__builtin_expect(mask2 != 0, 0))
321
+ return str + 32 + __builtin_ctz(mask2);
322
+
323
+ int mask3 = _mm_movemask_epi8(cmp3);
324
+ if (__builtin_expect(mask3 != 0, 0))
325
+ return str + 48 + __builtin_ctz(mask3);
326
+
327
+ str += 64;
328
+ }
329
+
330
+ // Handle remaining 16-byte chunks
331
+ for (; str <= safe_end_16; str += 16) {
332
+ const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
333
+ const __m128i matches = _mm_or_si128(
334
+ _mm_or_si128(_mm_cmpeq_epi8(chunk, null_char), _mm_cmpeq_epi8(chunk, backslash)),
335
+ _mm_cmpeq_epi8(chunk, quote));
336
+ int mask = _mm_movemask_epi8(matches);
337
+ if (mask != 0)
338
+ return str + __builtin_ctz(mask);
222
339
  }
223
340
 
224
341
  return scan_string_noSIMD(str, end);
@@ -228,9 +345,12 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
228
345
  static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
229
346
 
230
347
  void oj_scanner_init(void) {
231
- #ifdef OJ_USE_SSE4_2
232
- scan_func = scan_string_SIMD;
348
+ #ifdef HAVE_SIMD_SSE4_2
349
+ scan_func = scan_string_SSE42;
350
+ #elif defined(HAVE_SIMD_SSE2)
351
+ scan_func = scan_string_SSE2;
233
352
  #endif
353
+ // Note: ARM NEON string scanning would be added here if needed
234
354
  }
235
355
 
236
356
  // entered at /
data/ext/oj/rails.c CHANGED
@@ -661,13 +661,15 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
661
661
  Encoder e = OJ_R_ALLOC(struct _encoder);
662
662
 
663
663
  e->opts = oj_default_options;
664
- e->arg = Qnil;
665
664
  copy_opts(&ropts, &e->ropts);
666
665
 
667
666
  if (1 <= argc && Qnil != *argv) {
668
- oj_parse_options(*argv, &e->opts);
669
667
  e->arg = *argv;
668
+ } else {
669
+ e->arg = rb_hash_new();
670
670
  }
671
+ oj_parse_options(*argv, &e->opts);
672
+
671
673
  return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
672
674
  }
673
675
 
data/ext/oj/simd.h CHANGED
@@ -1,10 +1,47 @@
1
1
  #ifndef OJ_SIMD_H
2
2
  #define OJ_SIMD_H
3
3
 
4
+ // SIMD architecture detection and configuration
5
+ // This header provides unified SIMD support across different CPU architectures
6
+
7
+ // x86/x86_64 SIMD detection
8
+ #if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
9
+ #define HAVE_SIMD_X86 1
10
+
11
+ // SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
12
+ // Enabled automatically when the compiler defines __SSE4_2__ (e.g. when invoked with -msse4.2)
13
+ #if defined(__SSE4_2__)
14
+ #define HAVE_SIMD_SSE4_2 1
15
+ #include <nmmintrin.h>
16
+ #endif
17
+
18
+ // SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
19
+ #if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
20
+ #define HAVE_SIMD_SSE2 1
21
+ #include <emmintrin.h>
22
+ #endif
23
+
24
+ #endif // x86/x86_64
25
+
26
+ // ARM NEON detection
4
27
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
5
28
  #define HAVE_SIMD_NEON 1
6
29
  #define SIMD_MINIMUM_THRESHOLD 6
7
30
  #include <arm_neon.h>
8
31
  #endif
9
32
 
10
- #endif /* OJ_SIMD_H */
33
+ // Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
34
+ #if defined(HAVE_SIMD_SSE4_2)
35
+ #define HAVE_SIMD_STRING_SCAN 1
36
+ #define SIMD_TYPE "SSE4.2"
37
+ #elif defined(HAVE_SIMD_NEON)
38
+ #define HAVE_SIMD_STRING_SCAN 1
39
+ #define SIMD_TYPE "NEON"
40
+ #elif defined(HAVE_SIMD_SSE2)
41
+ #define HAVE_SIMD_STRING_SCAN 1
42
+ #define SIMD_TYPE "SSE2"
43
+ #else
44
+ #define SIMD_TYPE "none"
45
+ #endif
46
+
47
+ #endif /* OJ_SIMD_H */
data/lib/oj/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Oj
2
2
  # Current version of the module.
3
- VERSION = '3.16.12'
3
+ VERSION = '3.16.13'
4
4
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: oj
3
3
  version: !ruby/object:Gem::Version
4
- version: 3.16.12
4
+ version: 3.16.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler