simdjson 0.1.0

Files changed (132)
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h
@@ -0,0 +1,210 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
+
+ #include "simdjson/simdutf8check_arm64.h"
+ #include "simdjson/stage1_find_marks.h"
+ #include "simdjson/stage1_find_marks_flatten.h"
+ #include "simdjson/stage1_find_marks_macros.h"
+
+ #ifdef IS_ARM64
+ namespace simdjson {
+ template <> struct simd_input<Architecture::ARM64> {
+   uint8x16_t i0;
+   uint8x16_t i1;
+   uint8x16_t i2;
+   uint8x16_t i3;
+ };
+
+ template <>
+ really_inline simd_input<Architecture::ARM64>
+ fill_input<Architecture::ARM64>(const uint8_t *ptr) {
+   struct simd_input<Architecture::ARM64> in;
+   in.i0 = vld1q_u8(ptr + 0);
+   in.i1 = vld1q_u8(ptr + 16);
+   in.i2 = vld1q_u8(ptr + 32);
+   in.i3 = vld1q_u8(ptr + 48);
+   return in;
+ }
+
+ really_inline uint16_t neon_movemask(uint8x16_t input) {
+   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+   uint8x16_t minput = vandq_u8(input, bit_mask);
+   uint8x16_t tmp = vpaddq_u8(minput, minput);
+   tmp = vpaddq_u8(tmp, tmp);
+   tmp = vpaddq_u8(tmp, tmp);
+   return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
+ }
+
+ really_inline uint64_t neon_movemask_bulk(uint8x16_t p0, uint8x16_t p1,
+                                           uint8x16_t p2, uint8x16_t p3) {
+   const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+   uint8x16_t t0 = vandq_u8(p0, bit_mask);
+   uint8x16_t t1 = vandq_u8(p1, bit_mask);
+   uint8x16_t t2 = vandq_u8(p2, bit_mask);
+   uint8x16_t t3 = vandq_u8(p3, bit_mask);
+   uint8x16_t sum0 = vpaddq_u8(t0, t1);
+   uint8x16_t sum1 = vpaddq_u8(t2, t3);
+   sum0 = vpaddq_u8(sum0, sum1);
+   sum0 = vpaddq_u8(sum0, sum0);
+   return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0);
+ }
+
+ template <>
+ really_inline uint64_t
+ compute_quote_mask<Architecture::ARM64>(uint64_t quote_bits) {
+ #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
+   return vmull_p64(-1ULL, quote_bits);
+ #else
+   return portable_compute_quote_mask(quote_bits);
+ #endif
+ }
+
+ template <> struct utf8_checking_state<Architecture::ARM64> {
+   int8x16_t has_error{};
+   processed_utf_bytes previous{};
+ };
+
+ // Checks that all bytes are ascii
+ really_inline bool check_ascii_neon(simd_input<Architecture::ARM64> in) {
+   // checking if the most significant bit is always equal to 0.
+   uint8x16_t high_bit = vdupq_n_u8(0x80);
+   uint8x16_t t0 = vorrq_u8(in.i0, in.i1);
+   uint8x16_t t1 = vorrq_u8(in.i2, in.i3);
+   uint8x16_t t3 = vorrq_u8(t0, t1);
+   uint8x16_t t4 = vandq_u8(t3, high_bit);
+   uint64x2_t v64 = vreinterpretq_u64_u8(t4);
+   uint32x2_t v32 = vqmovn_u64(v64);
+   uint64x1_t result = vreinterpret_u64_u32(v32);
+   return vget_lane_u64(result, 0) == 0;
+ }
+
+ template <>
+ really_inline void check_utf8<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in,
+     utf8_checking_state<Architecture::ARM64> &state) {
+   if (check_ascii_neon(in)) {
+     // All bytes are ascii. Therefore the byte that was just before must be
+     // ascii too. We only check the byte that was just before simd_input.
+     // Nines are arbitrary values.
+     const int8x16_t verror =
+         (int8x16_t){9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1};
+     state.has_error =
+         vorrq_s8(vreinterpretq_s8_u8(
+                      vcgtq_s8(state.previous.carried_continuations, verror)),
+                  state.has_error);
+   } else {
+     // it is not ascii so we have to do heavy work
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i0),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i1),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i2),
+                                       &(state.previous), &(state.has_error));
+     state.previous = check_utf8_bytes(vreinterpretq_s8_u8(in.i3),
+                                       &(state.previous), &(state.has_error));
+   }
+ }
+
+ template <>
+ really_inline ErrorValues check_utf8_errors<Architecture::ARM64>(
+     utf8_checking_state<Architecture::ARM64> &state) {
+   uint64x2_t v64 = vreinterpretq_u64_s8(state.has_error);
+   uint32x2_t v32 = vqmovn_u64(v64);
+   uint64x1_t result = vreinterpret_u64_u32(v32);
+   return vget_lane_u64(result, 0) != 0 ? simdjson::UTF8_ERROR
+                                        : simdjson::SUCCESS;
+ }
+
+ template <>
+ really_inline uint64_t cmp_mask_against_input<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint8_t m) {
+   const uint8x16_t mask = vmovq_n_u8(m);
+   uint8x16_t cmp_res_0 = vceqq_u8(in.i0, mask);
+   uint8x16_t cmp_res_1 = vceqq_u8(in.i1, mask);
+   uint8x16_t cmp_res_2 = vceqq_u8(in.i2, mask);
+   uint8x16_t cmp_res_3 = vceqq_u8(in.i3, mask);
+   return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+ }
+
+ template <>
+ really_inline uint64_t unsigned_lteq_against_input<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint8_t m) {
+   const uint8x16_t mask = vmovq_n_u8(m);
+   uint8x16_t cmp_res_0 = vcleq_u8(in.i0, mask);
+   uint8x16_t cmp_res_1 = vcleq_u8(in.i1, mask);
+   uint8x16_t cmp_res_2 = vcleq_u8(in.i2, mask);
+   uint8x16_t cmp_res_3 = vcleq_u8(in.i3, mask);
+   return neon_movemask_bulk(cmp_res_0, cmp_res_1, cmp_res_2, cmp_res_3);
+ }
+
+ template <>
+ really_inline uint64_t find_odd_backslash_sequences<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in,
+     uint64_t &prev_iter_ends_odd_backslash) {
+   FIND_ODD_BACKSLASH_SEQUENCES(Architecture::ARM64, in,
+                                prev_iter_ends_odd_backslash);
+ }
+
+ template <>
+ really_inline uint64_t find_quote_mask_and_bits<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint64_t odd_ends,
+     uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
+     uint64_t &error_mask) {
+   FIND_QUOTE_MASK_AND_BITS(Architecture::ARM64, in, odd_ends,
+                            prev_iter_inside_quote, quote_bits, error_mask)
+ }
+
+ template <>
+ really_inline void find_whitespace_and_structurals<Architecture::ARM64>(
+     simd_input<Architecture::ARM64> in, uint64_t &whitespace,
+     uint64_t &structurals) {
+   const uint8x16_t low_nibble_mask =
+       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
+   const uint8x16_t high_nibble_mask =
+       (uint8x16_t){8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0};
+   const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
+   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
+   const uint8x16_t low_nib_and_mask = vmovq_n_u8(0xf);
+
+   uint8x16_t nib_0_lo = vandq_u8(in.i0, low_nib_and_mask);
+   uint8x16_t nib_0_hi = vshrq_n_u8(in.i0, 4);
+   uint8x16_t shuf_0_lo = vqtbl1q_u8(low_nibble_mask, nib_0_lo);
+   uint8x16_t shuf_0_hi = vqtbl1q_u8(high_nibble_mask, nib_0_hi);
+   uint8x16_t v_0 = vandq_u8(shuf_0_lo, shuf_0_hi);
+
+   uint8x16_t nib_1_lo = vandq_u8(in.i1, low_nib_and_mask);
+   uint8x16_t nib_1_hi = vshrq_n_u8(in.i1, 4);
+   uint8x16_t shuf_1_lo = vqtbl1q_u8(low_nibble_mask, nib_1_lo);
+   uint8x16_t shuf_1_hi = vqtbl1q_u8(high_nibble_mask, nib_1_hi);
+   uint8x16_t v_1 = vandq_u8(shuf_1_lo, shuf_1_hi);
+
+   uint8x16_t nib_2_lo = vandq_u8(in.i2, low_nib_and_mask);
+   uint8x16_t nib_2_hi = vshrq_n_u8(in.i2, 4);
+   uint8x16_t shuf_2_lo = vqtbl1q_u8(low_nibble_mask, nib_2_lo);
+   uint8x16_t shuf_2_hi = vqtbl1q_u8(high_nibble_mask, nib_2_hi);
+   uint8x16_t v_2 = vandq_u8(shuf_2_lo, shuf_2_hi);
+
+   uint8x16_t nib_3_lo = vandq_u8(in.i3, low_nib_and_mask);
+   uint8x16_t nib_3_hi = vshrq_n_u8(in.i3, 4);
+   uint8x16_t shuf_3_lo = vqtbl1q_u8(low_nibble_mask, nib_3_lo);
+   uint8x16_t shuf_3_hi = vqtbl1q_u8(high_nibble_mask, nib_3_hi);
+   uint8x16_t v_3 = vandq_u8(shuf_3_lo, shuf_3_hi);
+
+   uint8x16_t tmp_0 = vtstq_u8(v_0, structural_shufti_mask);
+   uint8x16_t tmp_1 = vtstq_u8(v_1, structural_shufti_mask);
+   uint8x16_t tmp_2 = vtstq_u8(v_2, structural_shufti_mask);
+   uint8x16_t tmp_3 = vtstq_u8(v_3, structural_shufti_mask);
+   structurals = neon_movemask_bulk(tmp_0, tmp_1, tmp_2, tmp_3);
+
+   uint8x16_t tmp_ws_0 = vtstq_u8(v_0, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_1 = vtstq_u8(v_1, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_2 = vtstq_u8(v_2, whitespace_shufti_mask);
+   uint8x16_t tmp_ws_3 = vtstq_u8(v_3, whitespace_shufti_mask);
+   whitespace = neon_movemask_bulk(tmp_ws_0, tmp_ws_1, tmp_ws_2, tmp_ws_3);
+ }
+ } // namespace simdjson
+
+ #endif // IS_ARM64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_ARM64_H
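
For reference, a minimal scalar sketch of what neon_movemask_bulk computes, since NEON has no direct counterpart to x86's movemask: the code ANDs each 0xFF/0x00 comparison lane with a per-lane bit weight and collapses the four registers with pairwise adds. The sketch below is not part of the gem and assumes each comparison lane is exactly 0xFF or 0x00:

    #include <cstdint>

    // Bit i of the result is set when byte i of the 64 concatenated
    // comparison lanes (p0..p3 above) is 0xFF.
    uint64_t scalar_movemask_bulk(const uint8_t cmp[64]) {
      uint64_t mask = 0;
      for (int i = 0; i < 64; i++) {
        if (cmp[i] == 0xFF) {
          mask |= uint64_t(1) << i; // lane i maps to bit i
        }
      }
      return mask;
    }

This is exactly the semantics cmp_mask_against_input and unsigned_lteq_against_input rely on to turn 64 byte comparisons into one 64-bit mask.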
data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h
@@ -0,0 +1,93 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
+
+ namespace simdjson {
+
+ #ifdef SIMDJSON_NAIVE_FLATTEN // useful for benchmarking
+ //
+ // This is just a naive implementation. It should normally be
+ // disabled, but it can be used for research purposes to compare
+ // against our optimized version.
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   uint32_t *out_ptr = base_ptr + base;
+   idx -= 64;
+   while (bits != 0) {
+     out_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     out_ptr++;
+   }
+   base = (out_ptr - base_ptr);
+ }
+
+ #else
+ // flatten out values in 'bits' assuming that they are to have values of idx
+ // plus their position in the bitvector, and store these indexes at
+ // base_ptr[base], incrementing base as we go
+ // will potentially store extra values beyond end of valid bits, so base_ptr
+ // needs to be large enough to handle this
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   // In some instances, the next branch is expensive because it is
+   // mispredicted. Unfortunately, in other cases, it helps tremendously.
+   if (bits == 0)
+     return;
+   uint32_t cnt = hamming(bits);
+   uint32_t next_base = base + cnt;
+   idx -= 64;
+   base_ptr += base;
+   {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr += 8;
+   }
+   // We hope that the next branch is easily predicted.
+   if (cnt > 8) {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = bits & (bits - 1);
+     base_ptr += 8;
+   }
+   if (cnt > 16) { // unlucky: we rarely get here
+     // since it means having one structural or pseudo-structural element
+     // every 4 characters (possible with inputs like "","","",...).
+     do {
+       base_ptr[0] = idx + trailing_zeroes(bits);
+       bits = bits & (bits - 1);
+       base_ptr++;
+     } while (bits != 0);
+   }
+   base = next_base;
+ }
+ #endif // SIMDJSON_NAIVE_FLATTEN
+ } // namespace simdjson
+
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_H
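
As the code above reads, the flatten_bits contract is: 'bits' is the structural bitmask of the 64-byte block ending at 'idx', and each set bit is appended to base_ptr[base] as an absolute byte index (idx - 64 plus the bit position). A simplified, unrolled-free sketch with a concrete usage, not part of the gem; it assumes GCC/Clang's __builtin_ctzll in place of trailing_zeroes:

    #include <cstdint>
    #include <cstdio>

    // Simplified flatten: no unrolling and no writes past the last valid index.
    void flatten_bits_simple(uint32_t *base_ptr, uint32_t &base,
                             uint32_t idx, uint64_t bits) {
      idx -= 64; // the caller passes the offset just past the block, so step back to its start
      while (bits != 0) {
        base_ptr[base++] = idx + __builtin_ctzll(bits); // position of the lowest set bit
        bits &= bits - 1;                               // clear the lowest set bit
      }
    }

    int main() {
      uint32_t out[4];
      uint32_t base = 0;
      flatten_bits_simple(out, base, 64, 0x9); // bits 0 and 3 set for the block at offset 0
      printf("%u %u\n", out[0], out[1]);       // prints "0 3"
    }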
data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h
@@ -0,0 +1,95 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
+
+ // This file provides the same function as
+ // stage1_find_marks_flatten.h, but uses Intel intrinsics.
+ // This should provide better performance on Visual Studio
+ // and other compilers that optimize conservatively.
+
+ // Specifically, on x64 processors with BMI,
+ // x & (x - 1) should be mapped to
+ // the blsr instruction. By using the
+ // _blsr_u64 intrinsic, we
+ // ensure that this will happen.
+ /////////
+
+ #include "simdjson/common_defs.h"
+ #include "simdjson/portability.h"
+
+ #ifdef IS_X86_64
+
+ TARGET_HASWELL
+ namespace simdjson {
+ namespace haswell {
+
+ // flatten out values in 'bits' assuming that they are to have values of idx
+ // plus their position in the bitvector, and store these indexes at
+ // base_ptr[base], incrementing base as we go
+ // will potentially store extra values beyond end of valid bits, so base_ptr
+ // needs to be large enough to handle this
+ really_inline void flatten_bits(uint32_t *base_ptr, uint32_t &base,
+                                 uint32_t idx, uint64_t bits) {
+   // In some instances, the next branch is expensive because it is
+   // mispredicted. Unfortunately, in other cases, it helps tremendously.
+   if (bits == 0)
+     return;
+   uint32_t cnt = _mm_popcnt_u64(bits);
+   uint32_t next_base = base + cnt;
+   idx -= 64;
+   base_ptr += base;
+   {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr += 8;
+   }
+   // We hope that the next branch is easily predicted.
+   if (cnt > 8) {
+     base_ptr[0] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[1] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[2] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[3] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[4] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[5] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[6] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr[7] = idx + trailing_zeroes(bits);
+     bits = _blsr_u64(bits);
+     base_ptr += 8;
+   }
+   if (cnt > 16) { // unlucky: we rarely get here
+     // since it means having one structural or pseudo-structural element
+     // every 4 characters (possible with inputs like "","","",...).
+     do {
+       base_ptr[0] = idx + trailing_zeroes(bits);
+       bits = _blsr_u64(bits);
+       base_ptr++;
+     } while (bits != 0);
+   }
+   base = next_base;
+ }
+ } // namespace haswell
+ } // namespace simdjson
+ UNTARGET_REGION
+ #endif // IS_X86_64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_FLATTEN_HASWELL_H
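
The only differences from the portable version are _mm_popcnt_u64 in place of hamming() and _blsr_u64 (BMI1 "reset lowest set bit") in place of x & (x - 1). A quick sanity check of that equivalence, assuming a BMI-enabled x64 build (e.g. -mbmi):

    #include <cassert>
    #include <cstdint>
    #include <immintrin.h>

    int main() {
      uint64_t x = 0b101100;
      // Both clear the lowest set bit: 0b101100 -> 0b101000.
      assert(_blsr_u64(x) == (x & (x - 1)));
      return 0;
    }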
data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h
@@ -0,0 +1,210 @@
+ #ifndef SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
+ #define SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
+
+ #include "simdjson/simdutf8check_haswell.h"
+ #include "simdjson/stage1_find_marks.h"
+ #include "simdjson/stage1_find_marks_flatten_haswell.h"
+ #include "simdjson/stage1_find_marks_macros.h"
+
+ #ifdef IS_X86_64
+
+ TARGET_HASWELL
+ namespace simdjson {
+ template <> struct simd_input<Architecture::HASWELL> {
+   __m256i lo;
+   __m256i hi;
+ };
+
+ template <>
+ really_inline simd_input<Architecture::HASWELL>
+ fill_input<Architecture::HASWELL>(const uint8_t *ptr) {
+   struct simd_input<Architecture::HASWELL> in;
+   in.lo = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 0));
+   in.hi = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr + 32));
+   return in;
+ }
+
+ template <>
+ really_inline uint64_t
+ compute_quote_mask<Architecture::HASWELL>(uint64_t quote_bits) {
+   // There should be no such thing as a processor supporting avx2
+   // but not clmul.
+   uint64_t quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(
+       _mm_set_epi64x(0ULL, quote_bits), _mm_set1_epi8(0xFFu), 0));
+   return quote_mask;
+ }
+
+ template <> struct utf8_checking_state<Architecture::HASWELL> {
+   __m256i has_error;
+   avx_processed_utf_bytes previous;
+   utf8_checking_state() {
+     has_error = _mm256_setzero_si256();
+     previous.raw_bytes = _mm256_setzero_si256();
+     previous.high_nibbles = _mm256_setzero_si256();
+     previous.carried_continuations = _mm256_setzero_si256();
+   }
+ };
+
+ template <>
+ really_inline void check_utf8<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in,
+     utf8_checking_state<Architecture::HASWELL> &state) {
+   __m256i high_bit = _mm256_set1_epi8(0x80u);
+   if ((_mm256_testz_si256(_mm256_or_si256(in.lo, in.hi), high_bit)) == 1) {
+     // it is ascii, we just check continuation
+     state.has_error = _mm256_or_si256(
+         _mm256_cmpgt_epi8(state.previous.carried_continuations,
+                           _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                            9, 9, 9, 9, 9, 9, 9, 1)),
+         state.has_error);
+   } else {
+     // it is not ascii so we have to do heavy work
+     state.previous =
+         avx_check_utf8_bytes(in.lo, &(state.previous), &(state.has_error));
+     state.previous =
+         avx_check_utf8_bytes(in.hi, &(state.previous), &(state.has_error));
+   }
+ }
+
+ template <>
+ really_inline ErrorValues check_utf8_errors<Architecture::HASWELL>(
+     utf8_checking_state<Architecture::HASWELL> &state) {
+   return _mm256_testz_si256(state.has_error, state.has_error) == 0
+              ? simdjson::UTF8_ERROR
+              : simdjson::SUCCESS;
+ }
+
+ template <>
+ really_inline uint64_t cmp_mask_against_input<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint8_t m) {
+   const __m256i mask = _mm256_set1_epi8(m);
+   __m256i cmp_res_0 = _mm256_cmpeq_epi8(in.lo, mask);
+   uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+   __m256i cmp_res_1 = _mm256_cmpeq_epi8(in.hi, mask);
+   uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+   return res_0 | (res_1 << 32);
+ }
+
+ template <>
+ really_inline uint64_t unsigned_lteq_against_input<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint8_t m) {
+   const __m256i maxval = _mm256_set1_epi8(m);
+   __m256i cmp_res_0 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.lo), maxval);
+   uint64_t res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(cmp_res_0));
+   __m256i cmp_res_1 = _mm256_cmpeq_epi8(_mm256_max_epu8(maxval, in.hi), maxval);
+   uint64_t res_1 = _mm256_movemask_epi8(cmp_res_1);
+   return res_0 | (res_1 << 32);
+ }
+
+ template <>
+ really_inline uint64_t find_odd_backslash_sequences<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in,
+     uint64_t &prev_iter_ends_odd_backslash) {
+   FIND_ODD_BACKSLASH_SEQUENCES(Architecture::HASWELL, in,
+                                prev_iter_ends_odd_backslash);
+ }
+
+ template <>
+ really_inline uint64_t find_quote_mask_and_bits<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint64_t odd_ends,
+     uint64_t &prev_iter_inside_quote, uint64_t &quote_bits,
+     uint64_t &error_mask) {
+   FIND_QUOTE_MASK_AND_BITS(Architecture::HASWELL, in, odd_ends,
+                            prev_iter_inside_quote, quote_bits, error_mask)
+ }
+
+ template <>
+ really_inline void find_whitespace_and_structurals<Architecture::HASWELL>(
+     simd_input<Architecture::HASWELL> in, uint64_t &whitespace,
+     uint64_t &structurals) {
+ #ifdef SIMDJSON_NAIVE_STRUCTURAL
+   // You should never need this naive approach, but it can be useful
+   // for research purposes
+   const __m256i mask_open_brace = _mm256_set1_epi8(0x7b);
+   __m256i struct_lo = _mm256_cmpeq_epi8(in.lo, mask_open_brace);
+   __m256i struct_hi = _mm256_cmpeq_epi8(in.hi, mask_open_brace);
+   const __m256i mask_close_brace = _mm256_set1_epi8(0x7d);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_brace));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_brace));
+   const __m256i mask_open_bracket = _mm256_set1_epi8(0x5b);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_open_bracket));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_open_bracket));
+   const __m256i mask_close_bracket = _mm256_set1_epi8(0x5d);
+   struct_lo =
+       _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_close_bracket));
+   struct_hi =
+       _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_close_bracket));
+   const __m256i mask_column = _mm256_set1_epi8(0x3a);
+   struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_column));
+   struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_column));
+   const __m256i mask_comma = _mm256_set1_epi8(0x2c);
+   struct_lo = _mm256_or_si256(struct_lo, _mm256_cmpeq_epi8(in.lo, mask_comma));
+   struct_hi = _mm256_or_si256(struct_hi, _mm256_cmpeq_epi8(in.hi, mask_comma));
+   uint64_t structural_res_0 =
+       static_cast<uint32_t>(_mm256_movemask_epi8(struct_lo));
+   uint64_t structural_res_1 = _mm256_movemask_epi8(struct_hi);
+   structurals = (structural_res_0 | (structural_res_1 << 32));
+
+   const __m256i mask_space = _mm256_set1_epi8(0x20);
+   __m256i space_lo = _mm256_cmpeq_epi8(in.lo, mask_space);
+   __m256i space_hi = _mm256_cmpeq_epi8(in.hi, mask_space);
+   const __m256i mask_linefeed = _mm256_set1_epi8(0x0a);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_linefeed));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_linefeed));
+   const __m256i mask_tab = _mm256_set1_epi8(0x09);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_tab));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_tab));
+   const __m256i mask_carriage = _mm256_set1_epi8(0x0d);
+   space_lo = _mm256_or_si256(space_lo, _mm256_cmpeq_epi8(in.lo, mask_carriage));
+   space_hi = _mm256_or_si256(space_hi, _mm256_cmpeq_epi8(in.hi, mask_carriage));
+
+   uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(space_lo));
+   uint64_t ws_res_1 = _mm256_movemask_epi8(space_hi);
+   whitespace = (ws_res_0 | (ws_res_1 << 32));
+   // end of naive approach
+
+ #else // SIMDJSON_NAIVE_STRUCTURAL
+   // clang-format off
+   const __m256i structural_table =
+       _mm256_setr_epi8(44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123,
+                        44, 125, 0, 0, 0xc0u, 0, 0, 0, 0, 0, 0, 0, 0, 0, 58, 123);
+   const __m256i white_table = _mm256_setr_epi8(
+       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100,
+       32, 100, 100, 100, 17, 100, 113, 2, 100, 9, 10, 112, 100, 13, 100, 100);
+   // clang-format on
+   const __m256i struct_offset = _mm256_set1_epi8(0xd4u);
+   const __m256i struct_mask = _mm256_set1_epi8(32);
+
+   __m256i lo_white =
+       _mm256_cmpeq_epi8(in.lo, _mm256_shuffle_epi8(white_table, in.lo));
+   __m256i hi_white =
+       _mm256_cmpeq_epi8(in.hi, _mm256_shuffle_epi8(white_table, in.hi));
+   uint64_t ws_res_0 = static_cast<uint32_t>(_mm256_movemask_epi8(lo_white));
+   uint64_t ws_res_1 = _mm256_movemask_epi8(hi_white);
+   whitespace = (ws_res_0 | (ws_res_1 << 32));
+   __m256i lo_struct_r1 = _mm256_add_epi8(struct_offset, in.lo);
+   __m256i hi_struct_r1 = _mm256_add_epi8(struct_offset, in.hi);
+   __m256i lo_struct_r2 = _mm256_or_si256(in.lo, struct_mask);
+   __m256i hi_struct_r2 = _mm256_or_si256(in.hi, struct_mask);
+   __m256i lo_struct_r3 = _mm256_shuffle_epi8(structural_table, lo_struct_r1);
+   __m256i hi_struct_r3 = _mm256_shuffle_epi8(structural_table, hi_struct_r1);
+   __m256i lo_struct = _mm256_cmpeq_epi8(lo_struct_r2, lo_struct_r3);
+   __m256i hi_struct = _mm256_cmpeq_epi8(hi_struct_r2, hi_struct_r3);
+
+   uint64_t structural_res_0 =
+       static_cast<uint32_t>(_mm256_movemask_epi8(lo_struct));
+   uint64_t structural_res_1 = _mm256_movemask_epi8(hi_struct);
+   structurals = (structural_res_0 | (structural_res_1 << 32));
+ #endif // SIMDJSON_NAIVE_STRUCTURAL
+ }
+
+ } // namespace simdjson
+ UNTARGET_REGION
+
+ #endif // IS_X86_64
+ #endif // SIMDJSON_STAGE1_FIND_MARKS_HASWELL_H
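
A note on compute_quote_mask above: the carry-less multiplication of quote_bits by an all-ones constant produces the running (inclusive) prefix XOR of the quote bits, so the mask is set on the bytes that lie inside a quoted string (the opening quote included, the closing quote excluded). A scalar sketch of that semantics, not part of the gem:

    #include <cstdint>

    // Bit i of the result is the XOR of quote_bits bits 0..i.
    uint64_t prefix_xor(uint64_t quote_bits) {
      uint64_t mask = 0;
      uint64_t inside = 0;
      for (int i = 0; i < 64; i++) {
        inside ^= (quote_bits >> i) & 1; // toggle at every quote character
        mask |= inside << i;             // bit i records whether byte i is inside quotes
      }
      return mask;
    }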