simdjson 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h
@@ -0,0 +1,210 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
+
+ #include "simdjson/simdutf8check_arm64.h"
+ #include "simdjson/stage1_find_marks.h"
+ #include "simdjson/stage1_find_marks_flatten.h"
+ #include "simdjson/stage1_find_marks_macros.h"
+
+ #ifdef IS_ARM64
+ namespace simdjson {
+ template <> struct simd_input<Architecture::ARM64> {
+   uint8x16_t i0;
+   uint8x16_t i1;
+   uint8x16_t i2;
+   uint8x16_t i3;
+ };
+
+ template <>
+ really_inline simd_input<Architecture::ARM64>
+ fill_input<Architecture::ARM64>(const uint8_t *ptr) {
+   struct simd_input<Architecture::ARM64> in;
+   in.i0 = vld1q_u8(ptr + 0);
+   in.i1 = vld1q_u8(ptr + 16);
+   in.i2 = vld1q_u8(ptr + 32);
+   in.i3 = vld1q_u8(ptr + 48);
+   return in;
+ }
+
+ really_inline uint16_t neon_movemask(uint8x16_t input) {
+   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+   uint8x16_t minput = vandq_u8(input, bit_mask);
+   uint8x16_t tmp = vpaddq_u8(minput, minput);
+   tmp = vpaddq_u8(tmp, tmp);
+   tmp = vpaddq_u8(tmp, tmp);
+   return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+ }
+
+ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
+                                           uint8x16_t p2, uint8x16_t p3) {
+   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+   uint8x16_t t0 = vandq_u8(p0, bit_mask);
+   uint8x16_t t1 = vandq_u8(p1, bit_mask);
+   uint8x16_t t2 = vandq_u8(p2, bit_mask);
+   uint8x16_t t3 = vandq_u8(p3, bit_mask);
+   uint8x16_t sum0 = vpaddq_u8(t0, t1);
+   uint8x16_t sum1 = vpaddq_u8(t2, t3);
+   sum0 = vpaddq_u8(sum0, sum1);
+   sum0 = vpaddq_u8(sum0, sum0);
+   return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+ }
+
+ template <>
+ really_inline uint64_t
+ compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
+ #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
+   return vmull_p64(-1ULL, quote_bits);
+ #else
+   return portable_compute_quote_mask(quote_bits);
+ #endif
+ }
+
+ template <> struct utf8_checking_state<Architecture::ARM64> {
+   int8x16_t has_error{};
+   processed_utf_bytes previous{};
+ };
+
+ // Checks that all bytes are ascii
+ really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
+   // checking if the most significant bit is always equal to 0.
+   uint8x16_t high_bit = vdupq_n_u8(0x80);
+   uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
+   uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
+   uint8x16_t t3 = vorrq_u8(t0, t1);
+   uint8x16_t t4 = vandq_u8(t3, high_bit);
+   uint64x2_t v64 = vreinterpretq_u64_u8(t4);
+   uint32x2_t v32 = vqmovn_u64(v64);
+   uint64x1_t result = vreinterpret_u64_u32(v32);
+   return vget_lane_u64(result, 0) == 0;
+ }
+
+ template <>
+ really_inline void check_utf8<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in,
+     utf8_checking_state<Architecture::ARM64> &state) {
+   if (check_ascii_neon(in)) {
+     // All bytes are ascii. Therefore the byte that was just before must be
+     // ascii too. We only check the byte that was just before simd_input. Nines
+     // are arbitrary values.
+     const int8x16_t verror =
+         (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
+     state.has_error =
+         vorrq_s8(vreinterpretq_s8_u8(
+                      vcgtq_s8(state.previous.carried_continuations, verror)),
+                  state.has_error);
+   } else {
+     // it is not ascii so we have to do heavy work
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
+                                       &(state.previous), &(state.has_error));
+   }
+ }
+
+ template <>
+ really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
+     utf8_checking_state<Architecture::ARM64> &state) {
+   uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
+   uint32x2_t v32 = vqmovn_u64(v64);
+   uint64x1_t result = vreinterpret_u64_u32(v32);
+   return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
+                                        : simdjson::SUCCESS;
+ }
+
+ template <>
+ really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint8_t m) {
+   const uint8x16_t mask = vmovq_n_u8(m);
+   uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
+   uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
+   uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
+   uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
+   return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+ }
+
+ template <>
+ really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint8_t m) {
+   const uint8x16_t mask = vmovq_n_u8(m);
+   uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
+   uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
+   uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
+   uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
+   return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+ }
+
+ template <>
+ really_inline uint64_t find_odd_backslash_sequences<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in,
+     uint64_t &prev_iter_ends_odd_backslash) {
+   FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in,
+                                prev_iter_ends_odd_backslash);
+ }
+
+ template <>
+ really_inline uint64_t find_quote_mask_and_bits<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint64_t odd_ends,
+     uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
+     uint64_t &error_mask) {
+   FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends,
+                            prev_iter_inside_quote, quote_bits, error_mask)
+ }
+
+ template <>
+ really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint64_t &whitespace,
+     uint64_t &structurals) {
+   const uint8x16_t low_nibble_mask =
+       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
+   const uint8x16_t high_nibble_mask =
+       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
+   const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
+
+   uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
+   uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
+   uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
+   uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
+   uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+
+   uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
+   uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
+   uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
+   uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
+   uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+
+   uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
+   uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
+   uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
+   uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
+   uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
+
+   uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
+   uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
+   uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
+   uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
+   uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
+
+   uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
+   uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
+   uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
+   uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
+   structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
+
+   uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
+   whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+ }
+ } // namespace simdjson
+
+ #endif // IS_ARM64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
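
Note on the ARM64 path above: neon_movemask_bulk packs four 16-byte NEON comparison results (lanes that are either 0x00 or 0xFF) into a single 64-bit mask, one bit per input byte, by ANDing with the per-lane bit_mask constant and folding with pairwise additions (vpaddq_u8). A portable scalar model of the same result, for illustration only and not part of the package:

    #include <cstdint>

    // Scalar model of neon_movemask_bulk: each input byte is assumed to be a
    // comparison result (0x00 or 0xFF); bit i of the output reports whether
    // byte i was 0xFF, least-significant bit first.
    uint64_t scalar_movemask_bulk(const uint8_t bytes[64]) {
      uint64_t mask = 0;
      for (int i = 0; i < 64; i++) {
        mask |= static_cast<uint64_t>(bytes[i] != 0) << i;
      }
      return mask;
    }
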
data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h
@@ -0,0 +1,93 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
+
+ namespace simdjson {
+
+ #ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
+ //
+ // This is just a naive implementation. It should normally be
+ // disabled, but it can be used for research purposes to compare
+ // against our optimized version.
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   uint32_t *out_ptr = base_ptr + base;
+   idx -= 64;
+   while (bits != 0) {
+     out_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     out_ptr++;
+   }
+   base = (out_ptr - base_ptr);
+ }
+
+ #else
+ // flatten out values in 'bits' assuming that they are to have values of idx
+ // plus their position in the bitvector, and store these indexes at
+ // base_ptr[base] incrementing base as we go
+ // will potentially store extra values beyond end of valid bits, so base_ptr
+ // needs to be large enough to handle this
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   // In some instances, the next branch is expensive because it is mispredicted.
+   // Unfortunately, in other cases,
+   // it helps tremendously.
+   if (bits == 0)
+     return;
+   uint32_t cnt = hamming(bits);
+   uint32_t next_base = base + cnt;
+   idx -= 64;
+   base_ptr += base;
+   {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr += 8;
+   }
+   // We hope that the next branch is easily predicted.
+   if (cnt > 8) {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr += 8;
+   }
+   if (cnt > 16) { // unlucky: we rarely get here
+     // since it means having one structural or pseudo-structural element
+     // every 4 characters (possible with inputs like "","","",...).
+     do {
+       base_ptr[0] = idx + trailing_zeroes(bits);
+       bits = bits & (bits - 1);
+       base_ptr++;
+     } while (bits != 0);
+   }
+   base = next_base;
+ }
+ #endif // SIMDJSON_NAIVE_FLATTEN
+ } // namespace simdjson
+
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
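
A minimal usage sketch of flatten_bits, assuming the header above is on the include path (the surrounding stage 1 driver loop is simplified away): for each 64-byte block, `bits` carries one bit per input byte and `idx` is the offset just past the block, so the recorded values are absolute byte positions. The unconditional 8-wide writes are also why the output buffer needs slack beyond the valid count.

    #include <cstdint>
    #include "simdjson/stage1_find_marks_flatten.h" // assumed include path, as used above

    void flatten_bits_example() {
      uint32_t indexes[64 + 8]; // slack for the 8-wide over-write past the valid count
      uint32_t base = 0;
      // One block: bits 0 and 3 are set, idx points just past the 64-byte block.
      simdjson::flatten_bits(indexes, base, /*idx=*/64, /*bits=*/0x9ULL);
      // Now base == 2, indexes[0] == 0 and indexes[1] == 3; entries 2..7 hold
      // scratch values that later calls or the consumer simply ignore.
    }
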
data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h
@@ -0,0 +1,95 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
+
+ // This file provides the same function as
+ // stage1_find_marks_flatten.h, but uses Intel intrinsics.
+ // This should provide better performance on Visual Studio
+ // and other compilers that optimize conservatively.
+
+ // Specifically, on x64 processors with BMI,
+ // x & (x - 1) should be mapped to
+ // the blsr instruction. By using the
+ // _blsr_u64 intrinsic, we
+ // ensure that this will happen.
+ /////////
+
+ #include "simdjson/common_defs.h"
+ #include "simdjson/portability.h"
+
+ #ifdef IS_X86_64
+
+ TARGET_HASWELL
+ namespace simdjson {
+ namespace haswell {
+
+ // flatten out values in 'bits' assuming that they are to have values of idx
+ // plus their position in the bitvector, and store these indexes at
+ // base_ptr[base] incrementing base as we go
+ // will potentially store extra values beyond end of valid bits, so base_ptr
+ // needs to be large enough to handle this
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   // In some instances, the next branch is expensive because it is mispredicted.
+   // Unfortunately, in other cases,
+   // it helps tremendously.
+   if (bits == 0)
+     return;
+   uint32_t cnt = _mm_popcnt_u64(bits);
+   uint32_t next_base = base + cnt;
+   idx -= 64;
+   base_ptr += base;
+   {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr += 8;
+   }
+   // We hope that the next branch is easily predicted.
+   if (cnt > 8) {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr += 8;
+   }
+   if (cnt > 16) { // unlucky: we rarely get here
+     // since it means having one structural or pseudo-structural element
+     // every 4 characters (possible with inputs like "","","",...).
+     do {
+       base_ptr[0] = idx + trailing_zeroes(bits);
+       bits = _blsr_u64(bits);
+       base_ptr++;
+     } while (bits != 0);
+   }
+   base = next_base;
+ }
+ } // namespace haswell
+ } // namespace simdjson
+ UNTARGET_REGION
+ #endif // IS_X86_64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
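
The comment block at the top of this file is the whole point of the Haswell variant: `_blsr_u64(x)` and `x & (x - 1)` both clear the lowest set bit, and spelling it with the intrinsic nudges compilers into emitting the BMI `blsr` instruction. A tiny check of that equivalence, for illustration only (requires a BMI-enabled build, e.g. -mbmi):

    #include <cstdint>
    #include <immintrin.h>

    // Both expressions clear the lowest set bit; the intrinsic just makes the
    // instruction selection explicit on BMI-capable x64 targets.
    bool blsr_matches_portable(uint64_t x) {
      return _blsr_u64(x) == (x & (x - 1));
    }
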
data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h
@@ -0,0 +1,210 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
+
+ #include "simdjson/simdutf8check_haswell.h"
+ #include "simdjson/stage1_find_marks.h"
+ #include "simdjson/stage1_find_marks_flatten_haswell.h"
+ #include "simdjson/stage1_find_marks_macros.h"
+
+ #ifdef IS_X86_64
+
+ TARGET_HASWELL
+ namespace simdjson {
+ template <> struct simd_input<Architecture::HASWELL> {
+   __m256i lo;
+   __m256i hi;
+ };
+
+ template <>
+ really_inline simd_input<Architecture::HASWELL>
+ fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
+   struct simd_input<Architecture::HASWELL> in;
+   in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
+   in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
+   return in;
+ }
+
+ template <>
+ really_inline uint64_t
+ compute_quote_mask<Architecture::HASWELL>(uint64_t quote_bits) {
+   // There should be no such thing as a processor supporting avx2
+   // but not clmul.
+   uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
+       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
+   return quote_mask;
+ }
+
+ template <> struct utf8_checking_state<Architecture::HASWELL> {
+   __m256i has_error;
+   avx_processed_utf_bytes previous;
+   utf8_checking_state() {
+     has_error = _mm256_setzero_si256();
+     previous.raw_bytes = _mm256_setzero_si256();
+     previous.high_nibbles = _mm256_setzero_si256();
+     previous.carried_continuations = _mm256_setzero_si256();
+   }
+ };
+
+ template <>
+ really_inline void check_utf8<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in,
+     utf8_checking_state<Architecture::HASWELL> &state) {
+   __m256i high_bit = _mm256_set1_epi8(0x80u);
+   if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
+     // it is ascii, we just check continuation
+     state.has_error = _mm256_or_si256(
+         _mm256_cmpgt_epi8(state.previous.carried_continuations,
+                           _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 1)),
+         state.has_error);
+   } else {
+     // it is not ascii so we have to do heavy work
+     state.previous =
+         avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
+     state.previous =
+         avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
+   }
+ }
+
+ template <>
+ really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
+     utf8_checking_state<Architecture::HASWELL> &state) {
+   return _mm256_testz_si256(state.has_error, state.has_error) == 0
+              ? simdjson::UTF8_ERROR
+              : simdjson::SUCCESS;
+ }
+
+ template <>
+ really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint8_t m) {
+   const __m256i mask = _mm256_set1_epi8(m);
+   __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
+   uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+   __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
+   uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+   return res_0 | (res_1 << 32);
+ }
+
+ template <>
+ really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint8_t m) {
+   const __m256i maxval = _mm256_set1_epi8(m);
+   __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
+   uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+   __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
+   uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+   return res_0 | (res_1 << 32);
+ }
+
+ template <>
+ really_inline uint64_t find_odd_backslash_sequences<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in,
+     uint64_t &prev_iter_ends_odd_backslash) {
+   FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in,
+                                prev_iter_ends_odd_backslash);
+ }
+
+ template <>
+ really_inline uint64_t find_quote_mask_and_bits<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint64_t odd_ends,
+     uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
+     uint64_t &error_mask) {
+   FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends,
+                            prev_iter_inside_quote, quote_bits, error_mask)
+ }
+
+ template <>
+ really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
+     uint64_t &structurals) {
+ #ifdef SIMDJSON_NAIVE_STRUCTURAL
+   // You should never need this naive approach, but it can be useful
+   // for research purposes
+   const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
+   __m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
+   __m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
+   const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
+   const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
+   const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
+   const __m256i mask_column = _mm256_set1_epi8(0x3a);
+   struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
+   struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
+   const __m256i mask_comma = _mm256_set1_epi8(0x2c);
+   struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
+   struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
+   uint64_t structural_res_0 =
+       static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
+   uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
+   structurals = (structural_res_0 | (structural_res_1 << 32));
+
+   const __m256i mask_space = _mm256_set1_epi8(0x20);
+   __m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
+   __m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
+   const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
+   const __m256i mask_tab = _mm256_set1_epi8(0x09);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
+   const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
+
+   uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
+   uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
+   whitespace = (ws_res_0 | (ws_res_1 << 32));
+   // end of naive approach
+
+ #else // SIMDJSON_NAIVE_STRUCTURAL
+   // clang-format off
+   const __m256i structural_table =
+       _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
+                        44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
+   const __m256i white_table = _mm256_setr_epi8(
+       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
+       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
+   // clang-format on
+   const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
+   const __m256i struct_mask = _mm256_set1_epi8(32);
+
+   __m256i lo_white =
+       _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
+   __m256i hi_white =
+       _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
+   uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
+   uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
+   whitespace = (ws_res_0 | (ws_res_1 << 32));
+   __m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
+   __m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
+   __m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
+   __m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
+   __m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
+   __m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
+   __m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
+   __m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
+
+   uint64_t structural_res_0 =
+       static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
+   uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
+   structurals = (structural_res_0 | (structural_res_1 << 32));
+ #endif // SIMDJSON_NAIVE_STRUCTURAL
+ }
+
+ } // namespace simdjson
+ UNTARGET_REGION
+
+ #endif // IS_X86_64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
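
The compute_quote_mask specialization above relies on a carry-less multiplication by an all-ones operand, which amounts to a prefix XOR: bit i of the result is the parity of the quote bits at positions 0 through i, which stage 1 uses to mark the span of bytes between quotes. A portable scalar model of that prefix XOR, for illustration only (the ARM64 file's portable_compute_quote_mask fallback plays the same role in the package):

    #include <cstdint>

    // Prefix XOR via the shift-and-xor doubling trick: after the last step,
    // bit i equals quote_bits[0] ^ quote_bits[1] ^ ... ^ quote_bits[i].
    uint64_t prefix_xor(uint64_t quote_bits) {
      uint64_t mask = quote_bits;
      mask ^= mask << 1;
      mask ^= mask << 2;
      mask ^= mask << 4;
      mask ^= mask << 8;
      mask ^= mask << 16;
      mask ^= mask << 32;
      return mask;
    }
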