simdjson 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
@@ -0,0 +1,393 @@
1
+ #include <cassert>
2
+ #include <cctype>
3
+ #ifndef _MSC_VER
4
+ #include <dirent.h>
5
+ #include <unistd.h>
6
+ #endif
7
+ #include <cinttypes>
8
+
9
+ #include <cstdio>
10
+ #include <cstdlib>
11
+ #include <cstring>
12
+
13
+ #include <algorithm>
14
+ #include <chrono>
15
+ #include <cstring>
16
+ #include <fstream>
17
+ #include <iomanip>
18
+ #include <iostream>
19
+ #include <map>
20
+ #include <set>
21
+ #include <sstream>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #include "linux-perf-events.h"
26
+ #ifdef __linux__
27
+ #include <libgen.h>
28
+ #endif
29
+ //#define DEBUG
30
+ #include "simdjson/common_defs.h"
31
+ #include "simdjson/isadetection.h"
32
+ #include "simdjson/jsonioutil.h"
33
+ #include "simdjson/jsonparser.h"
34
+ #include "simdjson/parsedjson.h"
35
+ #include "simdjson/stage1_find_marks.h"
36
+ #include "simdjson/stage2_build_tape.h"
37
+ namespace simdjson {
38
+ Architecture _find_best_supported_implementation() {
39
+ constexpr uint32_t haswell_flags =
40
+ instruction_set::AVX2 | instruction_set::PCLMULQDQ |
41
+ instruction_set::BMI1 | instruction_set::BMI2;
42
+ constexpr uint32_t westmere_flags =
43
+ instruction_set::SSE42 | instruction_set::PCLMULQDQ;
44
+ uint32_t supports = detect_supported_architectures();
45
+ // Order from best to worst (within architecture)
46
+ if ((haswell_flags & supports) == haswell_flags) {
47
+ return Architecture::HASWELL;
48
+ }
49
+ if ((westmere_flags & supports) == westmere_flags) {
50
+ return Architecture::WESTMERE;
51
+ }
52
+ if (instruction_set::NEON)
53
+ return Architecture::ARM64;
54
+
55
+ return Architecture::NONE;
56
+ }
57
+
58
+ using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
59
+ using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
60
+
61
+ extern unified_functype *unified_ptr;
62
+
63
+ extern stage1_functype *stage1_ptr;
64
+
65
+ int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
66
+ Architecture best_implementation = _find_best_supported_implementation();
67
+ // Selecting the best implementation
68
+ switch (best_implementation) {
69
+ #ifdef IS_X86_64
70
+ case Architecture::HASWELL:
71
+ unified_ptr = &unified_machine<Architecture::HASWELL>;
72
+ break;
73
+ case Architecture::WESTMERE:
74
+ unified_ptr = &unified_machine<Architecture::WESTMERE>;
75
+ break;
76
+ #endif
77
+ #ifdef IS_ARM64
78
+ case Architecture::ARM64:
79
+ unified_ptr = &unified_machine<Architecture::ARM64>;
80
+ break;
81
+ #endif
82
+ default:
83
+ std::cerr << "The processor is not supported by simdjson." << std::endl;
84
+ return simdjson::UNEXPECTED_ERROR;
85
+ }
86
+
87
+ return unified_ptr(buf, len, pj);
88
+ }
89
+
90
+ // Responsible to select the best json_parse implementation
91
+ int find_structural_bits_dispatch(const uint8_t *buf, size_t len,
92
+ ParsedJson &pj) {
93
+ Architecture best_implementation = _find_best_supported_implementation();
94
+ // Selecting the best implementation
95
+ switch (best_implementation) {
96
+ #ifdef IS_X86_64
97
+ case Architecture::HASWELL:
98
+ stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
99
+ break;
100
+ case Architecture::WESTMERE:
101
+ stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
102
+ break;
103
+ #endif
104
+ #ifdef IS_ARM64
105
+ case Architecture::ARM64:
106
+ stage1_ptr = &find_structural_bits<Architecture::ARM64>;
107
+ break;
108
+ #endif
109
+ default:
110
+ std::cerr << "The processor is not supported by simdjson." << std::endl;
111
+ return simdjson::UNEXPECTED_ERROR;
112
+ }
113
+
114
+ return stage1_ptr(buf, len, pj);
115
+ }
116
+
117
+ stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
118
+ unified_functype *unified_ptr = &unified_machine_dispatch;
119
+ } // namespace simdjson
120
+
121
+ int main(int argc, char *argv[]) {
122
+ bool verbose = false;
123
+ bool dump = false;
124
+ bool json_output = false;
125
+ bool force_one_iteration = false;
126
+ bool just_data = false;
127
+ #ifndef _MSC_VER
128
+ int c;
129
+
130
+ while ((c = getopt(argc, argv, "1vdt")) != -1) {
131
+ switch (c) {
132
+ case 't':
133
+ just_data = true;
134
+ break;
135
+ case 'v':
136
+ verbose = true;
137
+ break;
138
+ case 'd':
139
+ dump = true;
140
+ break;
141
+ case 'j':
142
+ json_output = true;
143
+ break;
144
+ case '1':
145
+ force_one_iteration = true;
146
+ break;
147
+ default:
148
+ abort();
149
+ }
150
+ }
151
+ #else
152
+ int optind = 1;
153
+ #endif
154
+ if (optind >= argc) {
155
+ std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
156
+ exit(1);
157
+ }
158
+ const char *filename = argv[optind];
159
+ if (optind + 1 < argc) {
160
+ std::cerr << "warning: ignoring everything after " << argv[optind + 1]
161
+ << std::endl;
162
+ }
163
+ if (verbose) {
164
+ std::cout << "[verbose] loading " << filename << std::endl;
165
+ }
166
+ simdjson::padded_string p;
167
+ try {
168
+ simdjson::get_corpus(filename).swap(p);
169
+ } catch (const std::exception &) { // caught by reference to base
170
+ std::cout << "Could not load the file " << filename << std::endl;
171
+ return EXIT_FAILURE;
172
+ }
173
+ if (verbose) {
174
+ std::cout << "[verbose] loaded " << filename << " (" << p.size()
175
+ << " bytes)" << std::endl;
176
+ }
177
+ #if defined(DEBUG)
178
+ const uint32_t iterations = 1;
179
+ #else
180
+ const uint32_t iterations =
181
+ force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
182
+ #endif
183
+ std::vector<double> res;
184
+ res.resize(iterations);
185
+ if (!just_data)
186
+ printf("number of iterations %u \n", iterations);
187
+ #if !defined(__linux__)
188
+ #define SQUASH_COUNTERS
189
+ if (just_data) {
190
+ printf("just_data (-t) flag only works under linux.\n");
191
+ }
192
+ #endif
193
+ { // practice run
194
+ simdjson::ParsedJson pj;
195
+ bool allocok = pj.allocate_capacity(p.size());
196
+ if (allocok) {
197
+ simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
198
+ simdjson::unified_ptr(
199
+ (const uint8_t
200
+ *)(const uint8_t
201
+ *)(const uint8_t
202
+ *)(const uint8_t
203
+ *)(const uint8_t
204
+ *)(const uint8_t
205
+ *)(const uint8_t
206
+ *)(const uint8_t *)
207
+ p.data(),
208
+ p.size(), pj);
209
+ }
210
+ }
211
+ #ifndef SQUASH_COUNTERS
212
+ std::vector<int> evts;
213
+ evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
214
+ evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
215
+ evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
216
+ evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
217
+ evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
218
+ LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
219
+ std::vector<unsigned long long> results;
220
+ results.resize(evts.size());
221
+ unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
222
+ unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
223
+ unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
224
+ unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
225
+ unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
226
+ #endif
227
+ bool isok = true;
228
+ #ifndef SQUASH_COUNTERS
229
+ for (uint32_t i = 0; i < iterations; i++) {
230
+ if (verbose) {
231
+ std::cout << "[verbose] iteration # " << i << std::endl;
232
+ }
233
+ unified.start();
234
+ simdjson::ParsedJson pj;
235
+ bool allocok = pj.allocate_capacity(p.size());
236
+ if (!allocok) {
237
+ std::cerr << "failed to allocate memory" << std::endl;
238
+ return EXIT_FAILURE;
239
+ }
240
+ unified.end(results);
241
+ cy0 += results[0];
242
+ cl0 += results[1];
243
+ mis0 += results[2];
244
+ cref0 += results[3];
245
+ cmis0 += results[4];
246
+ if (verbose) {
247
+ std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
248
+ }
249
+ unified.start();
250
+ isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
251
+ simdjson::SUCCESS);
252
+ unified.end(results);
253
+ cy1 += results[0];
254
+ cl1 += results[1];
255
+ mis1 += results[2];
256
+ cref1 += results[3];
257
+ cmis1 += results[4];
258
+ if (!isok) {
259
+ std::cout << "Failed during stage 1" << std::endl;
260
+ break;
261
+ }
262
+ unified.start();
263
+ isok = isok &&
264
+ (simdjson::SUCCESS ==
265
+ simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
266
+ unified.end(results);
267
+ cy2 += results[0];
268
+ cl2 += results[1];
269
+ mis2 += results[2];
270
+ cref2 += results[3];
271
+ cmis2 += results[4];
272
+ if (!isok) {
273
+ std::cout << "Failed during stage 2" << std::endl;
274
+ break;
275
+ }
276
+ }
277
+ #endif
278
+ // we do it again, this time just measuring the elapsed time
279
+ for (uint32_t i = 0; i < iterations; i++) {
280
+ if (verbose) {
281
+ std::cout << "[verbose] iteration # " << i << std::endl;
282
+ }
283
+ simdjson::ParsedJson pj;
284
+ bool allocok = pj.allocate_capacity(p.size());
285
+ if (!allocok) {
286
+ std::cerr << "failed to allocate memory" << std::endl;
287
+ return EXIT_FAILURE;
288
+ }
289
+ if (verbose) {
290
+ std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
291
+ }
292
+
293
+ auto start = std::chrono::steady_clock::now();
294
+ isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
295
+ simdjson::SUCCESS);
296
+ isok = isok &&
297
+ (simdjson::SUCCESS ==
298
+ simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
299
+ auto end = std::chrono::steady_clock::now();
300
+ std::chrono::duration<double> secs = end - start;
301
+ res[i] = secs.count();
302
+ if (!isok) {
303
+ std::cerr << pj.get_error_message() << std::endl;
304
+ std::cerr << "Could not parse. " << std::endl;
305
+ return EXIT_FAILURE;
306
+ }
307
+ }
308
+ simdjson::ParsedJson pj =
309
+ build_parsed_json(p); // do the parsing again to get the stats
310
+ if (!pj.is_valid()) {
311
+ std::cerr << pj.get_error_message() << std::endl;
312
+ std::cerr << "Could not parse. " << std::endl;
313
+ return EXIT_FAILURE;
314
+ }
315
+ double min_result = *min_element(res.begin(), res.end());
316
+ double speedinGBs = (p.size()) / (min_result * 1000000000.0);
317
+ #ifndef SQUASH_COUNTERS
318
+ unsigned long total = cy0 + cy1 + cy2;
319
+ if (just_data) {
320
+ float cpb0 = (double)cy0 / (iterations * p.size());
321
+ float cpb1 = (double)cy1 / (iterations * p.size());
322
+ float cpb2 = (double)cy2 / (iterations * p.size());
323
+ float cpbtotal = (double)total / (iterations * p.size());
324
+ char *newfile = (char *)malloc(strlen(filename) + 1);
325
+ if (newfile == NULL) {
326
+ return EXIT_FAILURE;
327
+ }
328
+ ::strcpy(newfile, filename);
329
+ char *snewfile = ::basename(newfile);
330
+ size_t nl = strlen(snewfile);
331
+ for (size_t j = nl - 1; j > 0; j--) {
332
+ if (snewfile[j] == '.') {
333
+ snewfile[j] = '\0';
334
+ break;
335
+ }
336
+ }
337
+ printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
338
+ speedinGBs);
339
+ free(newfile);
340
+ } else {
341
+ printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
342
+ p.size(), pj.n_structural_indexes,
343
+ (double)pj.n_structural_indexes / p.size());
344
+ printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
345
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
346
+ "%10lu (failure %10lu)\n",
347
+ cl0 / iterations, cy0 / iterations, 100. * cy0 / total,
348
+ (double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0,
349
+ cref1 / iterations, cmis0 / iterations);
350
+ printf(" mem alloc runs at %.2f cycles per input byte.\n",
351
+ (double)cy0 / (iterations * p.size()));
352
+ printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
353
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
354
+ "%10lu (failure %10lu)\n",
355
+ cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
356
+ (double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
357
+ cref1 / iterations, cmis1 / iterations);
358
+ printf(" stage 1 runs at %.2f cycles per input byte.\n",
359
+ (double)cy1 / (iterations * p.size()));
360
+
361
+ printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
362
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache "
363
+ "accesses: %10lu (failure %10lu)\n",
364
+ cl2 / iterations, cy2 / iterations, 100. * cy2 / total,
365
+ (double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
366
+ cref2 / iterations, cmis2 / iterations);
367
+ printf(" stage 2 runs at %.2f cycles per input byte and ",
368
+ (double)cy2 / (iterations * p.size()));
369
+ printf("%.2f cycles per structural character.\n",
370
+ (double)cy2 / (iterations * pj.n_structural_indexes));
371
+
372
+ printf(" all stages: %.2f cycles per input byte.\n",
373
+ (double)total / (iterations * p.size()));
374
+ printf("Estimated average frequency: %.3f GHz.\n",
375
+ (double)total / (iterations * min_result * 1000000000.0));
376
+ }
377
+ #endif
378
+ if (!just_data) {
379
+ std::cout << "Min: " << min_result << " bytes read: " << p.size()
380
+ << " Gigabytes/second: " << speedinGBs << std::endl;
381
+ }
382
+ if (json_output) {
383
+ isok = isok && pj.print_json(std::cout);
384
+ }
385
+ if (dump) {
386
+ isok = isok && pj.dump_raw_tape(std::cout);
387
+ }
388
+ if (!isok) {
389
+ fprintf(stderr, " Parsing failed. \n ");
390
+ return EXIT_FAILURE;
391
+ }
392
+ return EXIT_SUCCESS;
393
+ }
@@ -0,0 +1,305 @@
1
+ #include "simdjson/jsonparser.h"
2
+ #include <unistd.h>
3
+
4
+ #include "benchmark.h"
5
+ // #define RAPIDJSON_SSE2 // bad for performance
6
+ // #define RAPIDJSON_SSE42 // bad for performance
7
+ #include "rapidjson/document.h"
8
+ #include "rapidjson/reader.h"
9
+ #include "rapidjson/stringbuffer.h"
10
+ #include "rapidjson/writer.h"
11
+
12
+ #include "sajson.h"
13
+
14
+ using namespace rapidjson;
15
+ using namespace simdjson;
16
/// Per-document tallies gathered by each parser so their results can be
/// compared.
///
/// Every field has a default so that a default-constructed object is fully
/// defined: the *_compute_stats functions return early on a parse failure
/// before assigning the counters, and stat_equal() later reads them.
struct stat_s {
  size_t number_count = 0; // numeric values
  size_t object_count = 0; // objects
  size_t array_count = 0;  // arrays
  size_t null_count = 0;   // null literals
  size_t true_count = 0;   // true literals
  size_t false_count = 0;  // false literals
  bool valid = false;      // whether the parser accepted the document
};

using stat_t = stat_s;
27
+
28
+ bool stat_equal(const stat_t &s1, const stat_t &s2) {
29
+ return (s1.valid == s2.valid) && (s1.number_count == s2.number_count) &&
30
+ (s1.object_count == s2.object_count) &&
31
+ (s1.array_count == s2.array_count) &&
32
+ (s1.null_count == s2.null_count) && (s1.true_count == s2.true_count) &&
33
+ (s1.false_count == s2.false_count);
34
+ }
35
+
36
+ void print_stat(const stat_t &s) {
37
+ if (!s.valid) {
38
+ printf("invalid\n");
39
+ return;
40
+ }
41
+ printf("number: %zu object: %zu array: %zu null: %zu true: %zu false: %zu\n",
42
+ s.number_count, s.object_count, s.array_count, s.null_count,
43
+ s.true_count, s.false_count);
44
+ }
45
+
46
+ __attribute__((noinline)) stat_t
47
+ simdjson_compute_stats(const simdjson::padded_string &p) {
48
+ stat_t answer;
49
+ simdjson::ParsedJson pj = build_parsed_json(p);
50
+ answer.valid = pj.is_valid();
51
+ if (!answer.valid) {
52
+ return answer;
53
+ }
54
+ answer.number_count = 0;
55
+ answer.object_count = 0;
56
+ answer.array_count = 0;
57
+ answer.null_count = 0;
58
+ answer.true_count = 0;
59
+ answer.false_count = 0;
60
+ size_t tape_idx = 0;
61
+ uint64_t tape_val = pj.tape[tape_idx++];
62
+ uint8_t type = (tape_val >> 56);
63
+ size_t how_many = 0;
64
+ assert(type == 'r');
65
+ how_many = tape_val & JSON_VALUE_MASK;
66
+ for (; tape_idx < how_many; tape_idx++) {
67
+ tape_val = pj.tape[tape_idx];
68
+ // uint64_t payload = tape_val & JSON_VALUE_MASK;
69
+ type = (tape_val >> 56);
70
+ switch (type) {
71
+ case 'l': // we have a long int
72
+ answer.number_count++;
73
+ tape_idx++; // skipping the integer
74
+ break;
75
+ case 'd': // we have a double
76
+ answer.number_count++;
77
+ tape_idx++; // skipping the double
78
+ break;
79
+ case 'n': // we have a null
80
+ answer.null_count++;
81
+ break;
82
+ case 't': // we have a true
83
+ answer.true_count++;
84
+ break;
85
+ case 'f': // we have a false
86
+ answer.false_count++;
87
+ break;
88
+ case '{': // we have an object
89
+ answer.object_count++;
90
+ break;
91
+ case '}': // we end an object
92
+ break;
93
+ case '[': // we start an array
94
+ answer.array_count++;
95
+ break;
96
+ case ']': // we end an array
97
+ break;
98
+ default:
99
+ break; // ignore
100
+ }
101
+ }
102
+ return answer;
103
+ }
104
+
105
+ // see
106
+ // https://github.com/miloyip/nativejson-benchmark/blob/master/src/tests/sajsontest.cpp
107
+ void sajson_traverse(stat_t &stats, const sajson::value &node) {
108
+ using namespace sajson;
109
+ switch (node.get_type()) {
110
+ case TYPE_NULL:
111
+ stats.null_count++;
112
+ break;
113
+ case TYPE_FALSE:
114
+ stats.false_count++;
115
+ break;
116
+ case TYPE_TRUE:
117
+ stats.true_count++;
118
+ break;
119
+ case TYPE_ARRAY: {
120
+ stats.array_count++;
121
+ auto length = node.get_length();
122
+ for (size_t i = 0; i < length; ++i) {
123
+ sajson_traverse(stats, node.get_array_element(i));
124
+ }
125
+ break;
126
+ }
127
+ case TYPE_OBJECT: {
128
+ stats.object_count++;
129
+ auto length = node.get_length();
130
+ for (auto i = 0u; i < length; ++i) {
131
+ sajson_traverse(stats, node.get_object_value(i));
132
+ }
133
+ break;
134
+ }
135
+ case TYPE_STRING:
136
+ // skip
137
+ break;
138
+
139
+ case TYPE_DOUBLE:
140
+ case TYPE_INTEGER:
141
+ stats.number_count++; // node.get_number_value();
142
+ break;
143
+ default:
144
+ assert(false && "unknown node type");
145
+ }
146
+ }
147
+
148
+ __attribute__((noinline)) stat_t
149
+ sasjon_compute_stats(const simdjson::padded_string &p) {
150
+ stat_t answer;
151
+ char *buffer = (char *)malloc(p.size());
152
+ memcpy(buffer, p.data(), p.size());
153
+ auto d = sajson::parse(sajson::dynamic_allocation(),
154
+ sajson::mutable_string_view(p.size(), buffer));
155
+ answer.valid = d.is_valid();
156
+ if (!answer.valid) {
157
+ return answer;
158
+ }
159
+ answer.number_count = 0;
160
+ answer.object_count = 0;
161
+ answer.array_count = 0;
162
+ answer.null_count = 0;
163
+ answer.true_count = 0;
164
+ answer.false_count = 0;
165
+ sajson_traverse(answer, d.get_root());
166
+ free(buffer);
167
+ return answer;
168
+ }
169
+
170
+ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
171
+ switch (v.GetType()) {
172
+ case kNullType:
173
+ stats.null_count++;
174
+ break;
175
+ case kFalseType:
176
+ stats.false_count++;
177
+ break;
178
+ case kTrueType:
179
+ stats.true_count++;
180
+ break;
181
+
182
+ case kObjectType:
183
+ for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
184
+ ++m) {
185
+ rapid_traverse(stats, m->value);
186
+ }
187
+ stats.object_count++;
188
+ break;
189
+ case kArrayType:
190
+ for (Value::ConstValueIterator i = v.Begin(); i != v.End();
191
+ ++i) { // v.Size();
192
+ rapid_traverse(stats, *i);
193
+ }
194
+ stats.array_count++;
195
+ break;
196
+
197
+ case kStringType:
198
+ break;
199
+
200
+ case kNumberType:
201
+ stats.number_count++;
202
+ break;
203
+ }
204
+ }
205
+
206
+ __attribute__((noinline)) stat_t
207
+ rapid_compute_stats(const simdjson::padded_string &p) {
208
+ stat_t answer;
209
+ char *buffer = (char *)malloc(p.size() + 1);
210
+ memcpy(buffer, p.data(), p.size());
211
+ buffer[p.size()] = '\0';
212
+ rapidjson::Document d;
213
+ d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
214
+ answer.valid = !d.HasParseError();
215
+ if (!answer.valid) {
216
+ return answer;
217
+ }
218
+ answer.number_count = 0;
219
+ answer.object_count = 0;
220
+ answer.array_count = 0;
221
+ answer.null_count = 0;
222
+ answer.true_count = 0;
223
+ answer.false_count = 0;
224
+ rapid_traverse(answer, d);
225
+ free(buffer);
226
+ return answer;
227
+ }
228
+
229
+ int main(int argc, char *argv[]) {
230
+ bool verbose = false;
231
+ bool just_data = false;
232
+
233
+ int c;
234
+ while ((c = getopt(argc, argv, "vt")) != -1)
235
+ switch (c) {
236
+ case 't':
237
+ just_data = true;
238
+ break;
239
+ case 'v':
240
+ verbose = true;
241
+ break;
242
+ default:
243
+ abort();
244
+ }
245
+ if (optind >= argc) {
246
+ std::cerr
247
+ << "Using different parsers, we compute the content statistics of "
248
+ "JSON documents."
249
+ << std::endl;
250
+ std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
251
+ std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
252
+ exit(1);
253
+ }
254
+ const char *filename = argv[optind];
255
+ if (optind + 1 < argc) {
256
+ std::cerr << "warning: ignoring everything after " << argv[optind + 1]
257
+ << std::endl;
258
+ }
259
+ simdjson::padded_string p;
260
+ try {
261
+ simdjson::get_corpus(filename).swap(p);
262
+ } catch (const std::exception &e) { // caught by reference to base
263
+ std::cout << "Could not load the file " << filename << std::endl;
264
+ return EXIT_FAILURE;
265
+ }
266
+
267
+ if (verbose) {
268
+ std::cout << "Input has ";
269
+ if (p.size() > 1024 * 1024)
270
+ std::cout << p.size() / (1024 * 1024) << " MB ";
271
+ else if (p.size() > 1024)
272
+ std::cout << p.size() / 1024 << " KB ";
273
+ else
274
+ std::cout << p.size() << " B ";
275
+ std::cout << std::endl;
276
+ }
277
+ stat_t s1 = simdjson_compute_stats(p);
278
+ if (verbose) {
279
+ printf("simdjson: ");
280
+ print_stat(s1);
281
+ }
282
+ stat_t s2 = rapid_compute_stats(p);
283
+ if (verbose) {
284
+ printf("rapid: ");
285
+ print_stat(s2);
286
+ }
287
+ stat_t s3 = sasjon_compute_stats(p);
288
+ if (verbose) {
289
+ printf("sasjon: ");
290
+ print_stat(s3);
291
+ }
292
+ assert(stat_equal(s1, s2));
293
+ assert(stat_equal(s1, s3));
294
+ int repeat = 50;
295
+ int volume = p.size();
296
+ if (just_data) {
297
+ printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
298
+ }
299
+ BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat,
300
+ volume, !just_data);
301
+ BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume,
302
+ !just_data);
303
+ BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume,
304
+ !just_data);
305
+ }