oj 3.16.13 → 3.16.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/oj/dump.c +148 -9
- data/ext/oj/extconf.rb +9 -5
- data/ext/oj/oj.c +82 -0
- data/ext/oj/parse.c +30 -20
- data/ext/oj/rails.c +1 -1
- data/ext/oj/simd.h +138 -13
- data/lib/oj/version.rb +1 -1
- metadata +2 -16
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c97d24d950284c4f108b8ff933e4358250d53b69f9300b3faf2b8b7f29fdb5b7
|
|
4
|
+
data.tar.gz: 3112c763244b2f558e4f2a8f0ae79c01dcea9ec4281a915a2e39a404a34bf8f1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 3f32303a7e78478137b76fbadf4bcab93d164d60918fd5b8c20bccdd7cf0476e290e1290e4bc114f3fd6a3afaac0fb3c9df61c91ede88bfb9a9762d098731688
|
|
7
|
+
data.tar.gz: bcc7411ae7cde8ff0457563f67d99093b09c3d70a36a13c5db1144dbf983d5f864f204f421a966be30d3a2c3610f82033ed08c6c0c86d7ff2e8b391b4cd5b02f
|
data/CHANGELOG.md
CHANGED
data/ext/oj/dump.c
CHANGED
|
@@ -201,6 +201,45 @@ void initialize_neon(void) {
|
|
|
201
201
|
}
|
|
202
202
|
#endif
|
|
203
203
|
|
|
204
|
+
#ifdef HAVE_SIMD_SSE4_2
|
|
205
|
+
|
|
206
|
+
static __m128i hibit_friendly_chars_sse42[8];
|
|
207
|
+
|
|
208
|
+
// From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
|
|
209
|
+
inline uint32_t _mm_sum_epu8(const __m128i v) {
|
|
210
|
+
__m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
|
|
211
|
+
return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
inline static OJ_TARGET_SSE42 size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
|
|
215
|
+
size_t size = 0;
|
|
216
|
+
size_t i = 0;
|
|
217
|
+
|
|
218
|
+
for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
|
|
219
|
+
size += sizeof(__m128i);
|
|
220
|
+
|
|
221
|
+
__m128i chunk = _mm_loadu_si128((__m128i *)str);
|
|
222
|
+
__m128i tmp = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);
|
|
223
|
+
size += _mm_sum_epu8(tmp);
|
|
224
|
+
}
|
|
225
|
+
size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
|
|
226
|
+
return total;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
void OJ_TARGET_SSE42 initialize_sse42(void) {
|
|
230
|
+
for (int i = 0; i < 8; i++) {
|
|
231
|
+
hibit_friendly_chars_sse42[i] = _mm_sub_epi8(
|
|
232
|
+
_mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),
|
|
233
|
+
_mm_set1_epi8('1'));
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#else
|
|
238
|
+
|
|
239
|
+
#define SIMD_TARGET
|
|
240
|
+
|
|
241
|
+
#endif /* HAVE_SIMD_SSE4_2 */
|
|
242
|
+
|
|
204
243
|
inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
|
|
205
244
|
#ifdef HAVE_SIMD_NEON
|
|
206
245
|
size_t size = 0;
|
|
@@ -220,6 +259,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
|
|
|
220
259
|
|
|
221
260
|
size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
|
|
222
261
|
return total;
|
|
262
|
+
#elif defined(HAVE_SIMD_SSE4_2)
|
|
263
|
+
if (SIMD_Impl == SIMD_SSE42) {
|
|
264
|
+
if (len >= sizeof(__m128i)) {
|
|
265
|
+
return hibit_friendly_size_sse42(str, len);
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
return calculate_string_size(str, len, hibit_friendly_chars);
|
|
223
269
|
#else
|
|
224
270
|
return calculate_string_size(str, len, hibit_friendly_chars);
|
|
225
271
|
#endif
|
|
@@ -944,6 +990,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
|
|
|
944
990
|
return result;
|
|
945
991
|
}
|
|
946
992
|
|
|
993
|
+
#elif defined(HAVE_SIMD_SSE4_2)
|
|
994
|
+
typedef struct _sse42_match_result {
|
|
995
|
+
__m128i actions;
|
|
996
|
+
bool needs_escape;
|
|
997
|
+
int escape_mask;
|
|
998
|
+
bool has_some_hibit;
|
|
999
|
+
bool do_unicode_validation;
|
|
1000
|
+
} sse42_match_result;
|
|
1001
|
+
|
|
1002
|
+
static inline OJ_TARGET_SSE42 sse42_match_result
|
|
1003
|
+
sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
|
|
1004
|
+
sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};
|
|
1005
|
+
|
|
1006
|
+
__m128i chunk = _mm_loadu_si128((__m128i *)str);
|
|
1007
|
+
__m128i actions = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
|
|
1008
|
+
__m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));
|
|
1009
|
+
result.actions = _mm_add_epi8(actions, _mm_set1_epi8('1'));
|
|
1010
|
+
|
|
1011
|
+
result.escape_mask = _mm_movemask_epi8(needs_escape);
|
|
1012
|
+
result.needs_escape = result.escape_mask != 0;
|
|
1013
|
+
if (has_hi && do_unicode_validation) {
|
|
1014
|
+
__m128i has_some_hibit = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
|
|
1015
|
+
result.has_some_hibit = _mm_movemask_epi8(has_some_hibit) != 0;
|
|
1016
|
+
result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;
|
|
1017
|
+
}
|
|
1018
|
+
return result;
|
|
1019
|
+
}
|
|
1020
|
+
|
|
947
1021
|
#endif /* HAVE_SIMD_NEON */
|
|
948
1022
|
|
|
949
1023
|
static inline FORCE_INLINE const char *process_character(char action,
|
|
@@ -1023,6 +1097,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
|
|
|
1023
1097
|
#ifdef HAVE_SIMD_NEON
|
|
1024
1098
|
uint8x16x4_t *cmap_neon = NULL;
|
|
1025
1099
|
int neon_table_size = 0;
|
|
1100
|
+
#elif defined(HAVE_SIMD_SSE4_2)
|
|
1101
|
+
__m128i *cmap_sse42 = NULL;
|
|
1102
|
+
int sse42_tab_size;
|
|
1026
1103
|
#endif /* HAVE_SIMD_NEON */
|
|
1027
1104
|
const char *orig = str;
|
|
1028
1105
|
bool has_hi = false;
|
|
@@ -1091,6 +1168,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
|
|
|
1091
1168
|
#ifdef HAVE_SIMD_NEON
|
|
1092
1169
|
cmap_neon = hibit_friendly_chars_neon;
|
|
1093
1170
|
neon_table_size = 2;
|
|
1171
|
+
#elif defined(HAVE_SIMD_SSE4_2)
|
|
1172
|
+
cmap_sse42 = hibit_friendly_chars_sse42;
|
|
1173
|
+
sse42_tab_size = 8;
|
|
1094
1174
|
#endif /* HAVE_NEON_SIMD */
|
|
1095
1175
|
size = hibit_friendly_size((uint8_t *)str, cnt);
|
|
1096
1176
|
}
|
|
@@ -1118,21 +1198,32 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
|
|
|
1118
1198
|
if (is_sym) {
|
|
1119
1199
|
*out->cur++ = ':';
|
|
1120
1200
|
}
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
const char *cursor = str;
|
|
1125
|
-
bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
|
|
1126
|
-
char matches[16];
|
|
1201
|
+
|
|
1202
|
+
#if defined(HAVE_SIMD_NEON) || defined(HAVE_SIMD_SSE4_2)
|
|
1203
|
+
|
|
1127
1204
|
#define SEARCH_FLUSH \
|
|
1128
1205
|
if (str > cursor) { \
|
|
1129
1206
|
APPEND_CHARS(out->cur, cursor, str - cursor); \
|
|
1130
1207
|
cursor = str; \
|
|
1131
1208
|
}
|
|
1132
1209
|
|
|
1133
|
-
|
|
1210
|
+
const char *chunk_start;
|
|
1211
|
+
const char *chunk_end;
|
|
1212
|
+
const char *cursor = str;
|
|
1213
|
+
char matches[16];
|
|
1214
|
+
#endif /* HAVE_SIMD_NEON || HAVE_SIMD_SSE4_2 */
|
|
1215
|
+
|
|
1216
|
+
#if defined(HAVE_SIMD_NEON)
|
|
1217
|
+
bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
|
|
1218
|
+
#elif defined(HAVE_SIMD_SSE4_2)
|
|
1219
|
+
bool use_simd = false;
|
|
1220
|
+
if (SIMD_Impl == SIMD_SSE42) {
|
|
1221
|
+
use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
|
|
1222
|
+
}
|
|
1223
|
+
#endif
|
|
1224
|
+
|
|
1134
1225
|
#ifdef HAVE_SIMD_NEON
|
|
1135
|
-
if (
|
|
1226
|
+
if (use_simd) {
|
|
1136
1227
|
while (str < end) {
|
|
1137
1228
|
const char *chunk_ptr = NULL;
|
|
1138
1229
|
if (str + sizeof(uint8x16_t) <= end) {
|
|
@@ -1195,7 +1286,55 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
|
|
|
1195
1286
|
}
|
|
1196
1287
|
SEARCH_FLUSH;
|
|
1197
1288
|
}
|
|
1198
|
-
#endif
|
|
1289
|
+
#endif
|
|
1290
|
+
|
|
1291
|
+
#ifdef HAVE_SIMD_SSE4_2
|
|
1292
|
+
if (SIMD_Impl == SIMD_SSE42) {
|
|
1293
|
+
if (use_simd) {
|
|
1294
|
+
while (str < end) {
|
|
1295
|
+
const char *chunk_ptr = NULL;
|
|
1296
|
+
if (str + sizeof(__m128i) <= end) {
|
|
1297
|
+
chunk_ptr = str;
|
|
1298
|
+
chunk_start = str;
|
|
1299
|
+
chunk_end = str + sizeof(__m128i);
|
|
1300
|
+
} else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
|
|
1301
|
+
memset(out->cur, 'A', sizeof(__m128i));
|
|
1302
|
+
memcpy(out->cur, str, (end - str));
|
|
1303
|
+
chunk_ptr = out->cur;
|
|
1304
|
+
chunk_start = str;
|
|
1305
|
+
chunk_end = end;
|
|
1306
|
+
} else {
|
|
1307
|
+
break;
|
|
1308
|
+
}
|
|
1309
|
+
sse42_match_result result = sse42_update(chunk_ptr,
|
|
1310
|
+
cmap_sse42,
|
|
1311
|
+
sse42_tab_size,
|
|
1312
|
+
do_unicode_validation,
|
|
1313
|
+
has_hi);
|
|
1314
|
+
if ((result.do_unicode_validation) || result.needs_escape) {
|
|
1315
|
+
SEARCH_FLUSH;
|
|
1316
|
+
_mm_storeu_si128((__m128i *)matches, result.actions);
|
|
1317
|
+
while (str < chunk_end) {
|
|
1318
|
+
long match_index = str - chunk_start;
|
|
1319
|
+
str = process_character(matches[match_index],
|
|
1320
|
+
str,
|
|
1321
|
+
end,
|
|
1322
|
+
out,
|
|
1323
|
+
orig,
|
|
1324
|
+
do_unicode_validation,
|
|
1325
|
+
&check_start);
|
|
1326
|
+
str++;
|
|
1327
|
+
}
|
|
1328
|
+
cursor = str;
|
|
1329
|
+
continue;
|
|
1330
|
+
}
|
|
1331
|
+
str = chunk_end;
|
|
1332
|
+
}
|
|
1333
|
+
SEARCH_FLUSH;
|
|
1334
|
+
}
|
|
1335
|
+
}
|
|
1336
|
+
#endif /* HAVE_SIMD_SSE4_2 */
|
|
1337
|
+
|
|
1199
1338
|
for (; str < end; str++) {
|
|
1200
1339
|
str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
|
|
1201
1340
|
}
|
data/ext/oj/extconf.rb
CHANGED
|
@@ -35,11 +35,15 @@ have_func('rb_ext_ractor_safe', 'ruby.h')
|
|
|
35
35
|
|
|
36
36
|
dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?
|
|
37
37
|
|
|
38
|
-
#
|
|
39
|
-
#
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
38
|
+
# SIMD optimizations use runtime CPU detection and function-level target attributes
|
|
39
|
+
# We do NOT add global -msse4.2/-msse2 flags here because:
|
|
40
|
+
# 1. It would cause illegal instruction errors on CPUs without SSE4.2
|
|
41
|
+
# 2. The code uses __attribute__((target("sse4.2"))) for SSE4.2 functions
|
|
42
|
+
# 3. Runtime detection in oj_get_simd_implementation() selects the right path
|
|
43
|
+
#
|
|
44
|
+
# We only add -msse2 if available, since SSE2 is baseline for all x86_64 CPUs
|
|
45
|
+
# and needed for compiling the SSE2 fallback code on 32-bit x86
|
|
46
|
+
if try_cflags('-msse2')
|
|
43
47
|
$CPPFLAGS += ' -msse2'
|
|
44
48
|
end
|
|
45
49
|
|
data/ext/oj/oj.c
CHANGED
|
@@ -167,6 +167,8 @@ pthread_mutex_t oj_cache_mutex;
|
|
|
167
167
|
VALUE oj_cache_mutex = Qnil;
|
|
168
168
|
#endif
|
|
169
169
|
|
|
170
|
+
SIMD_Implementation SIMD_Impl = SIMD_NONE;
|
|
171
|
+
|
|
170
172
|
extern void oj_parser_init();
|
|
171
173
|
|
|
172
174
|
const char oj_json_class[] = "json_class";
|
|
@@ -1780,6 +1782,78 @@ static VALUE mem_report(VALUE self) {
|
|
|
1780
1782
|
*
|
|
1781
1783
|
* - *:wab* specifically for WAB data exchange.
|
|
1782
1784
|
*/
|
|
1785
|
+
|
|
1786
|
+
// =============================================================================
|
|
1787
|
+
// Runtime SIMD CPU detection
|
|
1788
|
+
// Cross-platform support for Windows (MSVC), Linux, and macOS (GCC/Clang)
|
|
1789
|
+
// =============================================================================
|
|
1790
|
+
SIMD_Implementation oj_get_simd_implementation(void) {
|
|
1791
|
+
#ifdef HAVE_SIMD_X86
|
|
1792
|
+
// x86/x86_64 runtime detection
|
|
1793
|
+
|
|
1794
|
+
#if defined(_MSC_VER)
|
|
1795
|
+
// MSVC: Use __cpuid intrinsic
|
|
1796
|
+
int cpu_info[4];
|
|
1797
|
+
__cpuid(cpu_info, 1);
|
|
1798
|
+
|
|
1799
|
+
// Check for SSE4.2 (bit 20 of ECX)
|
|
1800
|
+
if (cpu_info[2] & (1 << 20)) {
|
|
1801
|
+
return SIMD_SSE42;
|
|
1802
|
+
}
|
|
1803
|
+
// Check for SSE2 (bit 26 of EDX)
|
|
1804
|
+
if (cpu_info[3] & (1 << 26)) {
|
|
1805
|
+
return SIMD_SSE2;
|
|
1806
|
+
}
|
|
1807
|
+
|
|
1808
|
+
#elif defined(__GNUC__) || defined(__clang__)
|
|
1809
|
+
// GCC/Clang: Use __builtin_cpu_supports if available
|
|
1810
|
+
#if defined(__has_builtin)
|
|
1811
|
+
#if __has_builtin(__builtin_cpu_supports)
|
|
1812
|
+
#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
|
|
1813
|
+
#endif
|
|
1814
|
+
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8))
|
|
1815
|
+
// GCC 4.8+ has __builtin_cpu_supports
|
|
1816
|
+
#define OJ_HAS_BUILTIN_CPU_SUPPORTS 1
|
|
1817
|
+
#endif
|
|
1818
|
+
|
|
1819
|
+
#ifdef OJ_HAS_BUILTIN_CPU_SUPPORTS
|
|
1820
|
+
#ifdef HAVE_SIMD_SSE4_2
|
|
1821
|
+
if (__builtin_cpu_supports("sse4.2")) {
|
|
1822
|
+
return SIMD_SSE42;
|
|
1823
|
+
}
|
|
1824
|
+
#endif
|
|
1825
|
+
#ifdef HAVE_SIMD_SSE2
|
|
1826
|
+
if (__builtin_cpu_supports("sse2")) {
|
|
1827
|
+
return SIMD_SSE2;
|
|
1828
|
+
}
|
|
1829
|
+
#endif
|
|
1830
|
+
#else
|
|
1831
|
+
// Fallback: Use CPUID instruction directly
|
|
1832
|
+
unsigned int eax, ebx, ecx, edx;
|
|
1833
|
+
if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
|
|
1834
|
+
// Check for SSE4.2 (bit 20 of ECX)
|
|
1835
|
+
if (ecx & (1 << 20)) {
|
|
1836
|
+
return SIMD_SSE42;
|
|
1837
|
+
}
|
|
1838
|
+
// Check for SSE2 (bit 26 of EDX)
|
|
1839
|
+
if (edx & (1 << 26)) {
|
|
1840
|
+
return SIMD_SSE2;
|
|
1841
|
+
}
|
|
1842
|
+
}
|
|
1843
|
+
#endif // OJ_HAS_BUILTIN_CPU_SUPPORTS
|
|
1844
|
+
|
|
1845
|
+
#endif // _MSC_VER vs GCC/Clang
|
|
1846
|
+
|
|
1847
|
+
#endif // HAVE_SIMD_X86
|
|
1848
|
+
|
|
1849
|
+
#ifdef HAVE_SIMD_NEON
|
|
1850
|
+
// ARM NEON is always available on ARM64 and detected at compile time
|
|
1851
|
+
return SIMD_NEON;
|
|
1852
|
+
#endif
|
|
1853
|
+
|
|
1854
|
+
return SIMD_NONE;
|
|
1855
|
+
}
|
|
1856
|
+
|
|
1783
1857
|
void Init_oj(void) {
|
|
1784
1858
|
int err = 0;
|
|
1785
1859
|
|
|
@@ -2080,10 +2154,18 @@ void Init_oj(void) {
|
|
|
2080
2154
|
#endif
|
|
2081
2155
|
oj_init_doc();
|
|
2082
2156
|
|
|
2157
|
+
SIMD_Impl = oj_get_simd_implementation();
|
|
2158
|
+
|
|
2083
2159
|
oj_parser_init();
|
|
2084
2160
|
oj_scanner_init();
|
|
2085
2161
|
|
|
2086
2162
|
#ifdef HAVE_SIMD_NEON
|
|
2087
2163
|
initialize_neon();
|
|
2088
2164
|
#endif /* HAVE_SIMD_NEON */
|
|
2165
|
+
|
|
2166
|
+
#ifdef HAVE_SIMD_SSE4_2
|
|
2167
|
+
if (SIMD_Impl == SIMD_SSE42) {
|
|
2168
|
+
initialize_sse42();
|
|
2169
|
+
}
|
|
2170
|
+
#endif /* HAVE_SIMD_SSE4_2 */
|
|
2089
2171
|
}
|
data/ext/oj/parse.c
CHANGED
|
@@ -202,7 +202,8 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
|
|
|
202
202
|
#ifdef HAVE_SIMD_SSE4_2
|
|
203
203
|
// Optimized SIMD string scanner using SSE4.2 instructions
|
|
204
204
|
// Uses prefetching and processes multiple chunks in parallel to reduce latency
|
|
205
|
-
|
|
205
|
+
// Note: OJ_TARGET_SSE42 attribute allows this to compile even without global -msse4.2
|
|
206
|
+
static OJ_TARGET_SSE42 const char *scan_string_SSE42(const char *str, const char *end) {
|
|
206
207
|
static const char chars[16] = "\x00\\\"";
|
|
207
208
|
const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);
|
|
208
209
|
const char *safe_end_64 = end - 64;
|
|
@@ -212,7 +213,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
212
213
|
// This reduces pipeline stalls and improves instruction-level parallelism
|
|
213
214
|
while (str <= safe_end_64) {
|
|
214
215
|
// Prefetch next cache line for better memory throughput
|
|
215
|
-
|
|
216
|
+
OJ_PREFETCH(str + 64);
|
|
216
217
|
|
|
217
218
|
// Load and compare 4 chunks in parallel
|
|
218
219
|
const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
|
|
@@ -225,7 +226,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
225
226
|
chunk0,
|
|
226
227
|
16,
|
|
227
228
|
_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
|
|
228
|
-
if (
|
|
229
|
+
if (OJ_UNLIKELY(r0 != 16))
|
|
229
230
|
return str + r0;
|
|
230
231
|
|
|
231
232
|
const int r1 = _mm_cmpestri(terminate,
|
|
@@ -233,7 +234,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
233
234
|
chunk1,
|
|
234
235
|
16,
|
|
235
236
|
_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
|
|
236
|
-
if (
|
|
237
|
+
if (OJ_UNLIKELY(r1 != 16))
|
|
237
238
|
return str + 16 + r1;
|
|
238
239
|
|
|
239
240
|
const int r2 = _mm_cmpestri(terminate,
|
|
@@ -241,7 +242,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
241
242
|
chunk2,
|
|
242
243
|
16,
|
|
243
244
|
_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
|
|
244
|
-
if (
|
|
245
|
+
if (OJ_UNLIKELY(r2 != 16))
|
|
245
246
|
return str + 32 + r2;
|
|
246
247
|
|
|
247
248
|
const int r3 = _mm_cmpestri(terminate,
|
|
@@ -249,7 +250,7 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
249
250
|
chunk3,
|
|
250
251
|
16,
|
|
251
252
|
_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
|
|
252
|
-
if (
|
|
253
|
+
if (OJ_UNLIKELY(r3 != 16))
|
|
253
254
|
return str + 48 + r3;
|
|
254
255
|
|
|
255
256
|
str += 64;
|
|
@@ -274,7 +275,8 @@ static inline const char *scan_string_SSE42(const char *str, const char *end) {
|
|
|
274
275
|
#ifdef HAVE_SIMD_SSE2
|
|
275
276
|
// Optimized SSE2 string scanner (fallback for older x86_64 CPUs)
|
|
276
277
|
// Uses SSE2 instructions with prefetching and parallel processing
|
|
277
|
-
|
|
278
|
+
// Note: OJ_TARGET_SSE2 attribute allows this to compile even without global -msse2
|
|
279
|
+
static OJ_TARGET_SSE2 const char *scan_string_SSE2(const char *str, const char *end) {
|
|
278
280
|
const char *safe_end_64 = end - 64;
|
|
279
281
|
const char *safe_end_16 = end - 16;
|
|
280
282
|
|
|
@@ -285,7 +287,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
|
|
|
285
287
|
|
|
286
288
|
// Process 64 bytes at a time for better throughput
|
|
287
289
|
while (str <= safe_end_64) {
|
|
288
|
-
|
|
290
|
+
OJ_PREFETCH(str + 64);
|
|
289
291
|
|
|
290
292
|
// Load 4 chunks
|
|
291
293
|
const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
|
|
@@ -309,20 +311,20 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
|
|
|
309
311
|
|
|
310
312
|
// Convert to masks
|
|
311
313
|
int mask0 = _mm_movemask_epi8(cmp0);
|
|
312
|
-
if (
|
|
313
|
-
return str +
|
|
314
|
+
if (OJ_UNLIKELY(mask0 != 0))
|
|
315
|
+
return str + OJ_CTZ(mask0);
|
|
314
316
|
|
|
315
317
|
int mask1 = _mm_movemask_epi8(cmp1);
|
|
316
|
-
if (
|
|
317
|
-
return str + 16 +
|
|
318
|
+
if (OJ_UNLIKELY(mask1 != 0))
|
|
319
|
+
return str + 16 + OJ_CTZ(mask1);
|
|
318
320
|
|
|
319
321
|
int mask2 = _mm_movemask_epi8(cmp2);
|
|
320
|
-
if (
|
|
321
|
-
return str + 32 +
|
|
322
|
+
if (OJ_UNLIKELY(mask2 != 0))
|
|
323
|
+
return str + 32 + OJ_CTZ(mask2);
|
|
322
324
|
|
|
323
325
|
int mask3 = _mm_movemask_epi8(cmp3);
|
|
324
|
-
if (
|
|
325
|
-
return str + 48 +
|
|
326
|
+
if (OJ_UNLIKELY(mask3 != 0))
|
|
327
|
+
return str + 48 + OJ_CTZ(mask3);
|
|
326
328
|
|
|
327
329
|
str += 64;
|
|
328
330
|
}
|
|
@@ -335,7 +337,7 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
|
|
|
335
337
|
_mm_cmpeq_epi8(chunk, quote));
|
|
336
338
|
int mask = _mm_movemask_epi8(matches);
|
|
337
339
|
if (mask != 0)
|
|
338
|
-
return str +
|
|
340
|
+
return str + OJ_CTZ(mask);
|
|
339
341
|
}
|
|
340
342
|
|
|
341
343
|
return scan_string_noSIMD(str, end);
|
|
@@ -345,11 +347,19 @@ static inline const char *scan_string_SSE2(const char *str, const char *end) {
|
|
|
345
347
|
static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;
|
|
346
348
|
|
|
347
349
|
void oj_scanner_init(void) {
|
|
350
|
+
// Use runtime CPU detection to select the best SIMD implementation
|
|
351
|
+
// This ensures we don't crash on CPUs that don't support SSE4.2
|
|
352
|
+
SIMD_Implementation impl = oj_get_simd_implementation();
|
|
353
|
+
|
|
354
|
+
switch (impl) {
|
|
348
355
|
#ifdef HAVE_SIMD_SSE4_2
|
|
349
|
-
scan_func = scan_string_SSE42;
|
|
350
|
-
#
|
|
351
|
-
|
|
356
|
+
case SIMD_SSE42: scan_func = scan_string_SSE42; break;
|
|
357
|
+
#endif
|
|
358
|
+
#ifdef HAVE_SIMD_SSE2
|
|
359
|
+
case SIMD_SSE2: scan_func = scan_string_SSE2; break;
|
|
352
360
|
#endif
|
|
361
|
+
default: scan_func = scan_string_noSIMD; break;
|
|
362
|
+
}
|
|
353
363
|
// Note: ARM NEON string scanning would be added here if needed
|
|
354
364
|
}
|
|
355
365
|
|
data/ext/oj/rails.c
CHANGED
|
@@ -668,7 +668,7 @@ static VALUE encoder_new(int argc, VALUE *argv, VALUE self) {
|
|
|
668
668
|
} else {
|
|
669
669
|
e->arg = rb_hash_new();
|
|
670
670
|
}
|
|
671
|
-
oj_parse_options(
|
|
671
|
+
oj_parse_options(e->arg, &e->opts);
|
|
672
672
|
|
|
673
673
|
return TypedData_Wrap_Struct(encoder_class, &oj_encoder_type, e);
|
|
674
674
|
}
|
data/ext/oj/simd.h
CHANGED
|
@@ -3,45 +3,170 @@
|
|
|
3
3
|
|
|
4
4
|
// SIMD architecture detection and configuration
|
|
5
5
|
// This header provides unified SIMD support across different CPU architectures
|
|
6
|
+
// with cross-platform runtime detection (Windows/Linux/Mac)
|
|
6
7
|
|
|
8
|
+
// SIMD implementation enum - used for runtime selection
|
|
9
|
+
typedef enum _simd_implementation { SIMD_NONE, SIMD_NEON, SIMD_SSE2, SIMD_SSE42 } SIMD_Implementation;
|
|
10
|
+
|
|
11
|
+
// Define in oj.c.
|
|
12
|
+
extern SIMD_Implementation SIMD_Impl;
|
|
13
|
+
|
|
14
|
+
// Runtime CPU detection function (implemented in oj.c)
|
|
15
|
+
SIMD_Implementation oj_get_simd_implementation(void);
|
|
16
|
+
|
|
17
|
+
// =============================================================================
|
|
18
|
+
// Compiler compatibility macros
|
|
19
|
+
// =============================================================================
|
|
20
|
+
|
|
21
|
+
// Branch prediction hints
|
|
22
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
23
|
+
#define OJ_LIKELY(x) __builtin_expect(!!(x), 1)
|
|
24
|
+
#define OJ_UNLIKELY(x) __builtin_expect(!!(x), 0)
|
|
25
|
+
#else
|
|
26
|
+
#define OJ_LIKELY(x) (x)
|
|
27
|
+
#define OJ_UNLIKELY(x) (x)
|
|
28
|
+
#endif
|
|
29
|
+
|
|
30
|
+
// Prefetch hints
|
|
31
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
32
|
+
#define OJ_PREFETCH(addr) __builtin_prefetch(addr, 0, 0)
|
|
33
|
+
#elif defined(_MSC_VER)
|
|
34
|
+
#include <intrin.h>
|
|
35
|
+
#define OJ_PREFETCH(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0)
|
|
36
|
+
#else
|
|
37
|
+
#define OJ_PREFETCH(addr) ((void)0)
|
|
38
|
+
#endif
|
|
39
|
+
|
|
40
|
+
// Count trailing zeros (for SSE2 mask scanning)
|
|
41
|
+
#if defined(__GNUC__) || defined(__clang__)
|
|
42
|
+
#define OJ_CTZ(x) __builtin_ctz(x)
|
|
43
|
+
#elif defined(_MSC_VER)
|
|
44
|
+
#include <intrin.h>
|
|
45
|
+
static __inline int oj_ctz_msvc(unsigned int x) {
|
|
46
|
+
unsigned long index;
|
|
47
|
+
_BitScanForward(&index, x);
|
|
48
|
+
return (int)index;
|
|
49
|
+
}
|
|
50
|
+
#define OJ_CTZ(x) oj_ctz_msvc(x)
|
|
51
|
+
#else
|
|
52
|
+
// Fallback: naive implementation
|
|
53
|
+
static inline int oj_ctz_fallback(unsigned int x) {
|
|
54
|
+
int count = 0;
|
|
55
|
+
while ((x & 1) == 0 && count < 32) {
|
|
56
|
+
x >>= 1;
|
|
57
|
+
count++;
|
|
58
|
+
}
|
|
59
|
+
return count;
|
|
60
|
+
}
|
|
61
|
+
#define OJ_CTZ(x) oj_ctz_fallback(x)
|
|
62
|
+
#endif
|
|
63
|
+
|
|
64
|
+
// =============================================================================
|
|
7
65
|
// x86/x86_64 SIMD detection
|
|
66
|
+
// =============================================================================
|
|
8
67
|
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
|
|
9
68
|
#define HAVE_SIMD_X86 1
|
|
10
69
|
|
|
11
|
-
//
|
|
12
|
-
|
|
13
|
-
|
|
70
|
+
// Include appropriate SIMD headers
|
|
71
|
+
#if defined(_MSC_VER)
|
|
72
|
+
// MSVC: use intrin.h for all intrinsics
|
|
73
|
+
#include <intrin.h>
|
|
14
74
|
#define HAVE_SIMD_SSE4_2 1
|
|
15
|
-
#
|
|
75
|
+
#define HAVE_SIMD_SSE2 1
|
|
76
|
+
#elif defined(__GNUC__) || defined(__clang__)
|
|
77
|
+
// GCC/Clang: check for header availability and include them
|
|
78
|
+
// We include headers but use target attributes to enable instructions per-function
|
|
79
|
+
// Include cpuid.h for __get_cpuid fallback when __builtin_cpu_supports is unavailable
|
|
80
|
+
#if __has_include(<cpuid.h>)
|
|
81
|
+
#include <cpuid.h>
|
|
16
82
|
#endif
|
|
17
|
-
|
|
18
|
-
//
|
|
19
|
-
#
|
|
83
|
+
#if defined(__SSE4_2__) || defined(__SSE2__)
|
|
84
|
+
// If any SSE is enabled globally, x86intrin.h should be available
|
|
85
|
+
#include <x86intrin.h>
|
|
86
|
+
#define HAVE_SIMD_SSE4_2 1
|
|
87
|
+
#define HAVE_SIMD_SSE2 1
|
|
88
|
+
#else
|
|
89
|
+
// Try to include headers anyway for target attribute functions
|
|
90
|
+
#if __has_include(<x86intrin.h>)
|
|
91
|
+
#include <x86intrin.h>
|
|
92
|
+
#define HAVE_SIMD_SSE4_2 1
|
|
93
|
+
#define HAVE_SIMD_SSE2 1
|
|
94
|
+
#elif __has_include(<nmmintrin.h>)
|
|
95
|
+
#include <nmmintrin.h>
|
|
96
|
+
#define HAVE_SIMD_SSE4_2 1
|
|
20
97
|
#define HAVE_SIMD_SSE2 1
|
|
98
|
+
#elif __has_include(<emmintrin.h>)
|
|
21
99
|
#include <emmintrin.h>
|
|
100
|
+
#define HAVE_SIMD_SSE2 1
|
|
101
|
+
#endif
|
|
102
|
+
#endif
|
|
103
|
+
#endif
|
|
104
|
+
|
|
105
|
+
// Target attribute macros for function-level SIMD enabling
|
|
106
|
+
#if defined(__clang__) || defined(__GNUC__)
|
|
107
|
+
#define OJ_TARGET_SSE42 __attribute__((target("sse4.2")))
|
|
108
|
+
#define OJ_TARGET_SSE2 __attribute__((target("sse2")))
|
|
109
|
+
#else
|
|
110
|
+
// MSVC doesn't need target attributes - intrinsics are always available
|
|
111
|
+
#define OJ_TARGET_SSE42
|
|
112
|
+
#define OJ_TARGET_SSE2
|
|
22
113
|
#endif
|
|
23
114
|
|
|
24
115
|
#endif // x86/x86_64
|
|
25
116
|
|
|
117
|
+
// =============================================================================
|
|
26
118
|
// ARM NEON detection
|
|
119
|
+
// =============================================================================
|
|
27
120
|
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
|
|
28
121
|
#define HAVE_SIMD_NEON 1
|
|
29
122
|
#define SIMD_MINIMUM_THRESHOLD 6
|
|
30
123
|
#include <arm_neon.h>
|
|
31
124
|
#endif
|
|
32
125
|
|
|
33
|
-
//
|
|
34
|
-
|
|
126
|
+
// =============================================================================
|
|
127
|
+
// SIMD type string for debugging/logging
|
|
128
|
+
// =============================================================================
|
|
129
|
+
#if defined(HAVE_SIMD_SSE4_2) || defined(HAVE_SIMD_SSE2)
|
|
35
130
|
#define HAVE_SIMD_STRING_SCAN 1
|
|
36
|
-
#define SIMD_TYPE "
|
|
131
|
+
#define SIMD_TYPE "x86 (runtime detected)"
|
|
37
132
|
#elif defined(HAVE_SIMD_NEON)
|
|
38
133
|
#define HAVE_SIMD_STRING_SCAN 1
|
|
39
134
|
#define SIMD_TYPE "NEON"
|
|
40
|
-
#elif defined(HAVE_SIMD_SSE2)
|
|
41
|
-
#define HAVE_SIMD_STRING_SCAN 1
|
|
42
|
-
#define SIMD_TYPE "SSE2"
|
|
43
135
|
#else
|
|
44
136
|
#define SIMD_TYPE "none"
|
|
45
137
|
#endif
|
|
46
138
|
|
|
139
|
+
#if defined(HAVE_SIMD_SSE4_2)
|
|
140
|
+
|
|
141
|
+
#define SIMD_MINIMUM_THRESHOLD 6
|
|
142
|
+
|
|
143
|
+
extern void initialize_sse42(void);
|
|
144
|
+
|
|
145
|
+
static inline OJ_TARGET_SSE42 __m128i vector_lookup_sse42(__m128i input, __m128i *lookup_table, int tab_size) {
|
|
146
|
+
// Extract high 4 bits to determine which 16-byte chunk (0-15)
|
|
147
|
+
__m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));
|
|
148
|
+
|
|
149
|
+
// Extract low 4 bits for index within the chunk (0-15)
|
|
150
|
+
__m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));
|
|
151
|
+
|
|
152
|
+
// Perform lookups in all 16 tables
|
|
153
|
+
__m128i results[16];
|
|
154
|
+
for (int i = 0; i < tab_size; i++) {
|
|
155
|
+
results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// Create masks for each chunk and blend results
|
|
159
|
+
__m128i final_result = _mm_setzero_si128();
|
|
160
|
+
|
|
161
|
+
for (int i = 0; i < tab_size; i++) {
|
|
162
|
+
__m128i mask = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
|
|
163
|
+
__m128i masked_result = _mm_and_si128(mask, results[i]);
|
|
164
|
+
final_result = _mm_or_si128(final_result, masked_result);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
return final_result;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
#endif
|
|
171
|
+
|
|
47
172
|
#endif /* OJ_SIMD_H */
|
data/lib/oj/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: oj
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 3.16.
|
|
4
|
+
version: 3.16.14
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Peter Ohler
|
|
@@ -23,20 +23,6 @@ dependencies:
|
|
|
23
23
|
- - ">="
|
|
24
24
|
- !ruby/object:Gem::Version
|
|
25
25
|
version: '3.0'
|
|
26
|
-
- !ruby/object:Gem::Dependency
|
|
27
|
-
name: ostruct
|
|
28
|
-
requirement: !ruby/object:Gem::Requirement
|
|
29
|
-
requirements:
|
|
30
|
-
- - ">="
|
|
31
|
-
- !ruby/object:Gem::Version
|
|
32
|
-
version: '0.2'
|
|
33
|
-
type: :runtime
|
|
34
|
-
prerelease: false
|
|
35
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
-
requirements:
|
|
37
|
-
- - ">="
|
|
38
|
-
- !ruby/object:Gem::Version
|
|
39
|
-
version: '0.2'
|
|
40
26
|
- !ruby/object:Gem::Dependency
|
|
41
27
|
name: minitest
|
|
42
28
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -229,7 +215,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
229
215
|
- !ruby/object:Gem::Version
|
|
230
216
|
version: '0'
|
|
231
217
|
requirements: []
|
|
232
|
-
rubygems_version:
|
|
218
|
+
rubygems_version: 4.0.3
|
|
233
219
|
specification_version: 4
|
|
234
220
|
summary: A fast JSON parser and serializer.
|
|
235
221
|
test_files: []
|