json 2.15.2 → 2.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,14 @@
1
+ #include "../json.h"
2
+
1
3
  typedef enum { // Return type of find_simd_implementation(); judging by the constant names: no SIMD, ARM NEON, or x86 SSE2 (confirm against that function).
2
4
  SIMD_NONE,
3
5
  SIMD_NEON,
4
6
  SIMD_SSE2
5
7
  } SIMD_Implementation;
6
8
 
7
- #ifdef JSON_ENABLE_SIMD
9
+ #ifndef __has_builtin // Only needed when the compiler does not already provide __has_builtin.
10
+ #define __has_builtin(x) 0 // Fallback for such compilers: every __has_builtin(...) query evaluates to 0 ("not available").
11
+ #endif
8
12
 
9
13
  #ifdef __clang__
10
14
  # if __has_builtin(__builtin_ctzll)
@@ -20,6 +24,8 @@ typedef enum {
20
24
 
21
25
  static inline uint32_t trailing_zeros64(uint64_t input)
22
26
  {
27
+ JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
28
+
23
29
  #if HAVE_BUILTIN_CTZLL
24
30
  return __builtin_ctzll(input);
25
31
  #else
@@ -35,6 +41,8 @@ static inline uint32_t trailing_zeros64(uint64_t input)
35
41
 
36
42
  static inline int trailing_zeros(int input)
37
43
  {
44
+ JSON_ASSERT(input > 0); // __builtin_ctz(0) is undefined behavior
45
+
38
46
  #if HAVE_BUILTIN_CTZLL
39
47
  return __builtin_ctz(input);
40
48
  #else
@@ -48,14 +56,36 @@ static inline int trailing_zeros(int input)
48
56
  #endif
49
57
  }
50
58
 
51
- #if (defined(__GNUC__ ) || defined(__clang__))
52
- #define FORCE_INLINE __attribute__((always_inline))
53
- #else
54
- #define FORCE_INLINE
55
- #endif
59
+ #ifdef JSON_ENABLE_SIMD
56
60
 
61
+ #define SIMD_MINIMUM_THRESHOLD 4
57
62
 
58
- #define SIMD_MINIMUM_THRESHOLD 6
63
+ ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len) // Copies len bytes from src to dst; requires SIMD_MINIMUM_THRESHOLD <= len < 16.
64
+ {
65
+ RBIMPL_ASSERT_OR_ASSUME(len < 16); // precondition: two overlapping 8-byte copies cover at most 15 bytes here
66
+ RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // precondition: at least 4 bytes, so `len - 4` below cannot go negative
67
+ #if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
68
+ // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
69
+ // The two copies overlap: the first one writes the first 8 (or 4) bytes, the second one writes
70
+ // the last 8 (or 4) bytes. Bytes covered by both copies are written twice with the same value, so they
71
+ // end up in the correct position either way.
72
+
73
+ // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
74
+ // generated assembly. On clang specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
75
+ // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
76
+ // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
77
+ // plus two loads and stores generated when using __builtin_memcpy.
78
+ if (len >= 8) { // 8..15 bytes: two 8-byte copies
79
+ __builtin_memcpy(dst, src, 8);
80
+ __builtin_memcpy(dst + len - 8, src + len - 8, 8);
81
+ } else { // 4..7 bytes: two 4-byte copies
82
+ __builtin_memcpy(dst, src, 4);
83
+ __builtin_memcpy(dst + len - 4, src + len - 4, 4);
84
+ }
85
+ #else
86
+ MEMCPY(dst, src, char, len); // Fallback without the builtin: single copy of len chars (MEMCPY is defined outside this view).
87
+ #endif
88
+ }
59
89
 
60
90
  #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
61
91
  #include <arm_neon.h>
@@ -70,14 +100,14 @@ static inline SIMD_Implementation find_simd_implementation(void)
70
100
  #define HAVE_SIMD_NEON 1
71
101
 
72
102
  // See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
73
- static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches)
103
+ ALWAYS_INLINE(static) uint64_t neon_match_mask(uint8x16_t matches) // Condenses a 16-byte vector into a 64-bit mask with 4 bits per input byte; presumably each byte of matches is 0x00 or 0xFF (confirm against callers).
74
104
  {
75
105
  const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); // shift every 16-bit lane right by 4 and narrow it to 8 bits: 128 bits -> 64 bits
76
106
  const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); // read the 8 narrowed bytes as a single 64-bit integer
77
107
  return mask & 0x8888888888888888ull; // keep only the top bit of each 4-bit group, i.e. one bit per input byte
78
108
  }
79
109
 
80
- static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr)
110
+ ALWAYS_INLINE(static) uint64_t compute_chunk_mask_neon(const char *ptr)
81
111
  {
82
112
  uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr);
83
113
 
@@ -90,7 +120,7 @@ static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr)
90
120
  return neon_match_mask(needs_escape);
91
121
  }
92
122
 
93
- static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
123
+ ALWAYS_INLINE(static) int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask)
94
124
  {
95
125
  while (*ptr + sizeof(uint8x16_t) <= end) {
96
126
  uint64_t chunk_mask = compute_chunk_mask_neon(*ptr);
@@ -103,16 +133,6 @@ static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const cha
103
133
  return 0;
104
134
  }
105
135
 
106
- static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table)
107
- {
108
- uint8x16x4_t tab;
109
- tab.val[0] = vld1q_u8(table);
110
- tab.val[1] = vld1q_u8(table+16);
111
- tab.val[2] = vld1q_u8(table+32);
112
- tab.val[3] = vld1q_u8(table+48);
113
- return tab;
114
- }
115
-
116
136
  #endif /* ARM Neon Support.*/
117
137
 
118
138
  #if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
@@ -137,7 +157,7 @@ static inline uint8x16x4_t load_uint8x16_4(const unsigned char *table)
137
157
  #define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1))
138
158
  #define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a)
139
159
 
140
- static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *ptr)
160
+ ALWAYS_INLINE(static) TARGET_SSE2 int compute_chunk_mask_sse2(const char *ptr)
141
161
  {
142
162
  __m128i chunk = _mm_loadu_si128((__m128i const*)ptr);
143
163
  // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33
@@ -148,7 +168,7 @@ static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *p
148
168
  return _mm_movemask_epi8(needs_escape);
149
169
  }
150
170
 
151
- static inline TARGET_SSE2 FORCE_INLINE int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
171
+ ALWAYS_INLINE(static) TARGET_SSE2 int string_scan_simd_sse2(const char **ptr, const char *end, int *mask)
152
172
  {
153
173
  while (*ptr + sizeof(__m128i) <= end) {
154
174
  int chunk_mask = compute_chunk_mask_sse2(*ptr);
@@ -29,7 +29,7 @@
29
29
  #include <string.h>
30
30
  #include <stdint.h>
31
31
 
32
- #ifdef JSON_DEBUG
32
+ #if JSON_DEBUG
33
33
  #include <assert.h>
34
34
  #endif
35
35
 
@@ -449,7 +449,7 @@ static int filter_special(double fp, char* dest)
449
449
  * }
450
450
  *
451
451
  */
452
- static int fpconv_dtoa(double d, char dest[28])
452
+ static int fpconv_dtoa(double d, char dest[32])
453
453
  {
454
454
  char digits[18];
455
455
 
@@ -472,7 +472,7 @@ static int fpconv_dtoa(double d, char dest[28])
472
472
  int ndigits = grisu2(d, digits, &K);
473
473
 
474
474
  str_len += emit_digits(digits, ndigits, dest + str_len, K, neg);
475
- #ifdef JSON_DEBUG
475
+ #if JSON_DEBUG
476
476
  assert(str_len <= 32);
477
477
  #endif
478
478