json-extended 2.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
# Configure-time SIMD probe.
#
# Picks the intrinsic header, vector type, and initializer that match the host
# CPU, then checks that the header exists and that a tiny program using them
# compiles. On success JSON_ENABLE_SIMD is added to the compiler defines;
# otherwise a notice is printed. Hosts matching neither pattern are skipped
# silently.
simd_probe =
  case RbConfig::CONFIG['host_cpu']
  when /^(arm|aarch64)/
    # Try to compile a small program using NEON instructions
    ['arm_neon.h', 'uint8x16_t', 'vdupq_n_u8(32)', nil]
  when /^(x86_64|x64)/
    ['x86intrin.h', '__m128i', '_mm_set1_epi8(32)', 'if (__builtin_cpu_supports("sse2")) { printf("OK"); }']
  end

if simd_probe
  header, type, init, extra = simd_probe

  # have_header must run first: the compile test is only attempted when the
  # header was found.
  simd_compiles = have_header(header) && try_compile(<<~SRC, '-Werror=implicit-function-declaration')
    #{cpp_include(header)}
    int main(int argc, char **argv) {
      #{type} test = #{init};
      #{extra}
      if (argc > 100000) printf("%p", &test);
      return 0;
    }
  SRC

  if simd_compiles
    $defs.push("-DJSON_ENABLE_SIMD")
  else
    puts "Disable SIMD"
  end
end

# Probed on every host, independently of the SIMD check above.
have_header('cpuid.h')
@@ -0,0 +1,208 @@
1
+ #include "../json.h"
2
+
3
/* Identifies the vectorized backend reported by find_simd_implementation(). */
typedef enum {
    SIMD_NONE = 0, /* no SIMD backend available */
    SIMD_NEON,     /* ARM NEON backend */
    SIMD_SSE2      /* x86-64 SSE2 backend */
} SIMD_Implementation;
8
+
9
/*
 * Compilers that lack __has_builtin get a stub that reports every builtin as
 * missing, so the macro can be used unconditionally from here on.
 */
#if !defined(__has_builtin)
#  define __has_builtin(x) 0
#endif

/*
 * HAVE_BUILTIN_CTZLL is 1 when the count-trailing-zeros builtins may be used:
 *   - clang: only when __has_builtin reports __builtin_ctzll;
 *   - other GNU-compatible compilers: version 4.3 or newer;
 *   - anything else: 0.
 */
#if defined(__clang__) && __has_builtin(__builtin_ctzll)
#  define HAVE_BUILTIN_CTZLL 1
#elif !defined(__clang__) && defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
#  define HAVE_BUILTIN_CTZLL 1
#else
#  define HAVE_BUILTIN_CTZLL 0
#endif
24
+
25
/*
 * Counts the zero bits below the lowest set bit of a 64-bit value.
 *
 * `input` must be non-zero (checked with JSON_ASSERT). Uses __builtin_ctzll
 * when HAVE_BUILTIN_CTZLL is set, otherwise a portable shift loop.
 */
static inline uint32_t trailing_zeros64(uint64_t input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctzll(input);
#else
    uint32_t count = 0;
    for (uint64_t bits = input; bits > 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
41
+
42
/*
 * Counts the zero bits below the lowest set bit of an int.
 *
 * `input` must be strictly positive (checked with JSON_ASSERT). Uses
 * __builtin_ctz when HAVE_BUILTIN_CTZLL is set, otherwise a portable shift
 * loop.
 */
static inline int trailing_zeros(int input)
{
    JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior

#if HAVE_BUILTIN_CTZLL
    return __builtin_ctz(input);
#else
    int count = 0;
    for (int bits = input; bits > 0 && (bits & 1) == 0; bits >>= 1) {
        count++;
    }
    return count;
#endif
}
58
+
59
+ #ifdef JSON_ENABLE_SIMD
60
+
61
+ #define SIMD_MINIMUM_THRESHOLD 4
62
+
63
+ ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
64
+ {
65
+ RBIMPL_ASSERT_OR_ASSUME(len < 16);
66
+ RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
67
+ #if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
68
+ // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
69
+ // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
70
+ // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
71
+ // position in both copies.
72
+
73
+ // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
74
+ // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
75
+ // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
76
+ // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
77
+ // plus two loads and stores generated when using __builtin_memcpy.
78
+ if (len >= 8) {
79
+ __builtin_memcpy(dst, src, 8);
80
+ __builtin_memcpy(dst + len - 8, src + len - 8, 8);
81
+ } else {
82
+ __builtin_memcpy(dst, src, 4);
83
+ __builtin_memcpy(dst + len - 4, src + len - 4, 4);
84
+ }
85
+ #else
86
+ MEMCPY(dst, src, char, len);
87
+ #endif
88
+ }
89
+
90
+ #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
91
+ #include <arm_neon.h>
92
+
93
+ #define FIND_SIMD_IMPLEMENTATION_DEFINED 1
94
+ static inline SIMD_Implementation find_simd_implementation(void)
95
+ {
96
+ return SIMD_NEON;
97
+ }
98
+
99
+ #define HAVE_SIMD 1
100
+ #define HAVE_SIMD_NEON 1
101
+
102
+ // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
103
+ ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches)
104
+ {
105
+ const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4);
106
+ const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0);
107
+ return mask & 0x8888888888888888ull;
108
+ }
109
+
110
+ ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
111
+ {
112
+ uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
113
+
114
+ // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
115
+ // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
116
+ const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33));
117
+
118
+ uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\'));
119
+ uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash);
120
+ return neon_match_mask(needs_escape);
121
+ }
122
+
123
+ ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
124
+ {
125
+ while (*ptr + sizeof(uint8x16_t) <= end) {
126
+ uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
127
+ if (chunk_mask) {
128
+ *mask = chunk_mask;
129
+ return 1;
130
+ }
131
+ *ptr += sizeof(uint8x16_t);
132
+ }
133
+ return 0;
134
+ }
135
+
136
+ #endif /* ARM Neon Support.*/
137
+
138
#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)

#ifdef HAVE_X86INTRIN_H
#include <x86intrin.h>

/*
 * The SSE2 backend is all-or-nothing: the feature macros, the scanning
 * routines and the runtime selector are only provided when cpuid.h is
 * available as well. Keeping them under one guard ensures that
 * find_simd_implementation() is never defined here without
 * FIND_SIMD_IMPLEMENTATION_DEFINED (which would collide with the generic
 * fallback at the end of this file), and that HAVE_SIMD_SSE2 is never
 * advertised without string_scan_simd_sse2() to back it.
 */
#ifdef HAVE_CPUID_H
#include <cpuid.h>

#define HAVE_SIMD 1
#define HAVE_SIMD_SSE2 1
#define FIND_SIMD_IMPLEMENTATION_DEFINED 1

#if defined(__clang__) || defined(__GNUC__)
#define TARGET_SSE2 __attribute__((target("sse2")))
#else
#define TARGET_SSE2
#endif

/* Unsigned byte comparisons built from the SSE2 max/equality primitives. */
#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a)
#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a)
#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)

/*
 * Loads 16 bytes from `ptr` (unaligned load) and returns a bitmask, one bit
 * per byte, of the bytes that are below 32, equal to '"' (34), or equal to
 * '\\'. The caller must guarantee that 16 bytes are readable at `ptr`.
 */
ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
{
    __m128i chunk = _mm_loadu_si128((__m128i const*)ptr);
    // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
    // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/
    __m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33));
    __m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\'));
    __m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash);
    return _mm_movemask_epi8(needs_escape);
}

/*
 * Scans [*ptr, end) in 16-byte chunks for the first chunk that contains a
 * byte flagged by compute_chunk_mask_sse2().
 *
 * Returns 1 and stores the chunk's mask in *mask when such a chunk is found;
 * *ptr is left at the start of that chunk. Returns 0 once fewer than 16 bytes
 * remain; *ptr has then been advanced past every chunk scanned and *mask is
 * untouched.
 */
ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
{
    // Compare the remaining length instead of computing `*ptr + 16`: forming a
    // pointer beyond one-past-the-end of the buffer is undefined behavior.
    // The signed comparison also stops the loop if *ptr is already past end.
    while (end - *ptr >= (ptrdiff_t)sizeof(__m128i)) {
        int chunk_mask = compute_chunk_mask_sse2(*ptr);
        if (chunk_mask) {
            *mask = chunk_mask;
            return 1;
        }
        *ptr += sizeof(__m128i);
    }

    return 0;
}

/*
 * Runtime selector for x86-64: reports SIMD_SSE2 when the running CPU
 * supports SSE2, SIMD_NONE otherwise.
 */
static inline SIMD_Implementation find_simd_implementation(void)
{
    // TODO Revisit. I think the SSE version now only uses SSE2 instructions.
    if (__builtin_cpu_supports("sse2")) {
        return SIMD_SSE2;
    }

    return SIMD_NONE;
}

#endif /* HAVE_CPUID_H */
#endif /* HAVE_X86INTRIN_H */
#endif /* X86_64 Support */
200
+
201
+ #endif /* JSON_ENABLE_SIMD */
202
+
203
+ #ifndef FIND_SIMD_IMPLEMENTATION_DEFINED
204
+ static inline SIMD_Implementation find_simd_implementation(void)
205
+ {
206
+ return SIMD_NONE;
207
+ }
208
+ #endif