simdjson 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (132) hide show
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
@@ -0,0 +1,393 @@
1
+ #include <cassert>
2
+ #include <cctype>
3
+ #ifndef _MSC_VER
4
+ #include <dirent.h>
5
+ #include <unistd.h>
6
+ #endif
7
+ #include <cinttypes>
8
+
9
+ #include <cstdio>
10
+ #include <cstdlib>
11
+ #include <cstring>
12
+
13
+ #include <algorithm>
14
+ #include <chrono>
15
+ #include <cstring>
16
+ #include <fstream>
17
+ #include <iomanip>
18
+ #include <iostream>
19
+ #include <map>
20
+ #include <set>
21
+ #include <sstream>
22
+ #include <string>
23
+ #include <vector>
24
+
25
+ #include "linux-perf-events.h"
26
+ #ifdef __linux__
27
+ #include <libgen.h>
28
+ #endif
29
+ //#define DEBUG
30
+ #include "simdjson/common_defs.h"
31
+ #include "simdjson/isadetection.h"
32
+ #include "simdjson/jsonioutil.h"
33
+ #include "simdjson/jsonparser.h"
34
+ #include "simdjson/parsedjson.h"
35
+ #include "simdjson/stage1_find_marks.h"
36
+ #include "simdjson/stage2_build_tape.h"
37
+ namespace simdjson {
38
+ Architecture _find_best_supported_implementation() {
39
+ constexpr uint32_t haswell_flags =
40
+ instruction_set::AVX2 | instruction_set::PCLMULQDQ |
41
+ instruction_set::BMI1 | instruction_set::BMI2;
42
+ constexpr uint32_t westmere_flags =
43
+ instruction_set::SSE42 | instruction_set::PCLMULQDQ;
44
+ uint32_t supports = detect_supported_architectures();
45
+ // Order from best to worst (within architecture)
46
+ if ((haswell_flags & supports) == haswell_flags) {
47
+ return Architecture::HASWELL;
48
+ }
49
+ if ((westmere_flags & supports) == westmere_flags) {
50
+ return Architecture::WESTMERE;
51
+ }
52
+ if (instruction_set::NEON)
53
+ return Architecture::ARM64;
54
+
55
+ return Architecture::NONE;
56
+ }
57
+
58
+ using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
59
+ using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
60
+
61
+ extern unified_functype *unified_ptr;
62
+
63
+ extern stage1_functype *stage1_ptr;
64
+
65
+ int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
66
+ Architecture best_implementation = _find_best_supported_implementation();
67
+ // Selecting the best implementation
68
+ switch (best_implementation) {
69
+ #ifdef IS_X86_64
70
+ case Architecture::HASWELL:
71
+ unified_ptr = &unified_machine<Architecture::HASWELL>;
72
+ break;
73
+ case Architecture::WESTMERE:
74
+ unified_ptr = &unified_machine<Architecture::WESTMERE>;
75
+ break;
76
+ #endif
77
+ #ifdef IS_ARM64
78
+ case Architecture::ARM64:
79
+ unified_ptr = &unified_machine<Architecture::ARM64>;
80
+ break;
81
+ #endif
82
+ default:
83
+ std::cerr << "The processor is not supported by simdjson." << std::endl;
84
+ return simdjson::UNEXPECTED_ERROR;
85
+ }
86
+
87
+ return unified_ptr(buf, len, pj);
88
+ }
89
+
90
+ // Responsible to select the best json_parse implementation
91
+ int find_structural_bits_dispatch(const uint8_t *buf, size_t len,
92
+ ParsedJson &pj) {
93
+ Architecture best_implementation = _find_best_supported_implementation();
94
+ // Selecting the best implementation
95
+ switch (best_implementation) {
96
+ #ifdef IS_X86_64
97
+ case Architecture::HASWELL:
98
+ stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
99
+ break;
100
+ case Architecture::WESTMERE:
101
+ stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
102
+ break;
103
+ #endif
104
+ #ifdef IS_ARM64
105
+ case Architecture::ARM64:
106
+ stage1_ptr = &find_structural_bits<Architecture::ARM64>;
107
+ break;
108
+ #endif
109
+ default:
110
+ std::cerr << "The processor is not supported by simdjson." << std::endl;
111
+ return simdjson::UNEXPECTED_ERROR;
112
+ }
113
+
114
+ return stage1_ptr(buf, len, pj);
115
+ }
116
+
117
+ stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
118
+ unified_functype *unified_ptr = &unified_machine_dispatch;
119
+ } // namespace simdjson
120
+
121
+ int main(int argc, char *argv[]) {
122
+ bool verbose = false;
123
+ bool dump = false;
124
+ bool json_output = false;
125
+ bool force_one_iteration = false;
126
+ bool just_data = false;
127
+ #ifndef _MSC_VER
128
+ int c;
129
+
130
+ while ((c = getopt(argc, argv, "1vdt")) != -1) {
131
+ switch (c) {
132
+ case 't':
133
+ just_data = true;
134
+ break;
135
+ case 'v':
136
+ verbose = true;
137
+ break;
138
+ case 'd':
139
+ dump = true;
140
+ break;
141
+ case 'j':
142
+ json_output = true;
143
+ break;
144
+ case '1':
145
+ force_one_iteration = true;
146
+ break;
147
+ default:
148
+ abort();
149
+ }
150
+ }
151
+ #else
152
+ int optind = 1;
153
+ #endif
154
+ if (optind >= argc) {
155
+ std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
156
+ exit(1);
157
+ }
158
+ const char *filename = argv[optind];
159
+ if (optind + 1 < argc) {
160
+ std::cerr << "warning: ignoring everything after " << argv[optind + 1]
161
+ << std::endl;
162
+ }
163
+ if (verbose) {
164
+ std::cout << "[verbose] loading " << filename << std::endl;
165
+ }
166
+ simdjson::padded_string p;
167
+ try {
168
+ simdjson::get_corpus(filename).swap(p);
169
+ } catch (const std::exception &) { // caught by reference to base
170
+ std::cout << "Could not load the file " << filename << std::endl;
171
+ return EXIT_FAILURE;
172
+ }
173
+ if (verbose) {
174
+ std::cout << "[verbose] loaded " << filename << " (" << p.size()
175
+ << " bytes)" << std::endl;
176
+ }
177
+ #if defined(DEBUG)
178
+ const uint32_t iterations = 1;
179
+ #else
180
+ const uint32_t iterations =
181
+ force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
182
+ #endif
183
+ std::vector<double> res;
184
+ res.resize(iterations);
185
+ if (!just_data)
186
+ printf("number of iterations %u \n", iterations);
187
+ #if !defined(__linux__)
188
+ #define SQUASH_COUNTERS
189
+ if (just_data) {
190
+ printf("just_data (-t) flag only works under linux.\n");
191
+ }
192
+ #endif
193
+ { // practice run
194
+ simdjson::ParsedJson pj;
195
+ bool allocok = pj.allocate_capacity(p.size());
196
+ if (allocok) {
197
+ simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
198
+ simdjson::unified_ptr(
199
+ (const uint8_t
200
+ *)(const uint8_t
201
+ *)(const uint8_t
202
+ *)(const uint8_t
203
+ *)(const uint8_t
204
+ *)(const uint8_t
205
+ *)(const uint8_t
206
+ *)(const uint8_t *)
207
+ p.data(),
208
+ p.size(), pj);
209
+ }
210
+ }
211
+ #ifndef SQUASH_COUNTERS
212
+ std::vector<int> evts;
213
+ evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
214
+ evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
215
+ evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
216
+ evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
217
+ evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
218
+ LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
219
+ std::vector<unsigned long long> results;
220
+ results.resize(evts.size());
221
+ unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
222
+ unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
223
+ unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
224
+ unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
225
+ unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
226
+ #endif
227
+ bool isok = true;
228
+ #ifndef SQUASH_COUNTERS
229
+ for (uint32_t i = 0; i < iterations; i++) {
230
+ if (verbose) {
231
+ std::cout << "[verbose] iteration # " << i << std::endl;
232
+ }
233
+ unified.start();
234
+ simdjson::ParsedJson pj;
235
+ bool allocok = pj.allocate_capacity(p.size());
236
+ if (!allocok) {
237
+ std::cerr << "failed to allocate memory" << std::endl;
238
+ return EXIT_FAILURE;
239
+ }
240
+ unified.end(results);
241
+ cy0 += results[0];
242
+ cl0 += results[1];
243
+ mis0 += results[2];
244
+ cref0 += results[3];
245
+ cmis0 += results[4];
246
+ if (verbose) {
247
+ std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
248
+ }
249
+ unified.start();
250
+ isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
251
+ simdjson::SUCCESS);
252
+ unified.end(results);
253
+ cy1 += results[0];
254
+ cl1 += results[1];
255
+ mis1 += results[2];
256
+ cref1 += results[3];
257
+ cmis1 += results[4];
258
+ if (!isok) {
259
+ std::cout << "Failed during stage 1" << std::endl;
260
+ break;
261
+ }
262
+ unified.start();
263
+ isok = isok &&
264
+ (simdjson::SUCCESS ==
265
+ simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
266
+ unified.end(results);
267
+ cy2 += results[0];
268
+ cl2 += results[1];
269
+ mis2 += results[2];
270
+ cref2 += results[3];
271
+ cmis2 += results[4];
272
+ if (!isok) {
273
+ std::cout << "Failed during stage 2" << std::endl;
274
+ break;
275
+ }
276
+ }
277
+ #endif
278
+ // we do it again, this time just measuring the elapsed time
279
+ for (uint32_t i = 0; i < iterations; i++) {
280
+ if (verbose) {
281
+ std::cout << "[verbose] iteration # " << i << std::endl;
282
+ }
283
+ simdjson::ParsedJson pj;
284
+ bool allocok = pj.allocate_capacity(p.size());
285
+ if (!allocok) {
286
+ std::cerr << "failed to allocate memory" << std::endl;
287
+ return EXIT_FAILURE;
288
+ }
289
+ if (verbose) {
290
+ std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
291
+ }
292
+
293
+ auto start = std::chrono::steady_clock::now();
294
+ isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
295
+ simdjson::SUCCESS);
296
+ isok = isok &&
297
+ (simdjson::SUCCESS ==
298
+ simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
299
+ auto end = std::chrono::steady_clock::now();
300
+ std::chrono::duration<double> secs = end - start;
301
+ res[i] = secs.count();
302
+ if (!isok) {
303
+ std::cerr << pj.get_error_message() << std::endl;
304
+ std::cerr << "Could not parse. " << std::endl;
305
+ return EXIT_FAILURE;
306
+ }
307
+ }
308
+ simdjson::ParsedJson pj =
309
+ build_parsed_json(p); // do the parsing again to get the stats
310
+ if (!pj.is_valid()) {
311
+ std::cerr << pj.get_error_message() << std::endl;
312
+ std::cerr << "Could not parse. " << std::endl;
313
+ return EXIT_FAILURE;
314
+ }
315
+ double min_result = *min_element(res.begin(), res.end());
316
+ double speedinGBs = (p.size()) / (min_result * 1000000000.0);
317
+ #ifndef SQUASH_COUNTERS
318
+ unsigned long total = cy0 + cy1 + cy2;
319
+ if (just_data) {
320
+ float cpb0 = (double)cy0 / (iterations * p.size());
321
+ float cpb1 = (double)cy1 / (iterations * p.size());
322
+ float cpb2 = (double)cy2 / (iterations * p.size());
323
+ float cpbtotal = (double)total / (iterations * p.size());
324
+ char *newfile = (char *)malloc(strlen(filename) + 1);
325
+ if (newfile == NULL) {
326
+ return EXIT_FAILURE;
327
+ }
328
+ ::strcpy(newfile, filename);
329
+ char *snewfile = ::basename(newfile);
330
+ size_t nl = strlen(snewfile);
331
+ for (size_t j = nl - 1; j > 0; j--) {
332
+ if (snewfile[j] == '.') {
333
+ snewfile[j] = '\0';
334
+ break;
335
+ }
336
+ }
337
+ printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
338
+ speedinGBs);
339
+ free(newfile);
340
+ } else {
341
+ printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
342
+ p.size(), pj.n_structural_indexes,
343
+ (double)pj.n_structural_indexes / p.size());
344
+ printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
345
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
346
+ "%10lu (failure %10lu)\n",
347
+ cl0 / iterations, cy0 / iterations, 100. * cy0 / total,
348
+ (double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0,
349
+ cref1 / iterations, cmis0 / iterations);
350
+ printf(" mem alloc runs at %.2f cycles per input byte.\n",
351
+ (double)cy0 / (iterations * p.size()));
352
+ printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
353
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
354
+ "%10lu (failure %10lu)\n",
355
+ cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
356
+ (double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
357
+ cref1 / iterations, cmis1 / iterations);
358
+ printf(" stage 1 runs at %.2f cycles per input byte.\n",
359
+ (double)cy1 / (iterations * p.size()));
360
+
361
+ printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
362
+ "%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache "
363
+ "accesses: %10lu (failure %10lu)\n",
364
+ cl2 / iterations, cy2 / iterations, 100. * cy2 / total,
365
+ (double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
366
+ cref2 / iterations, cmis2 / iterations);
367
+ printf(" stage 2 runs at %.2f cycles per input byte and ",
368
+ (double)cy2 / (iterations * p.size()));
369
+ printf("%.2f cycles per structural character.\n",
370
+ (double)cy2 / (iterations * pj.n_structural_indexes));
371
+
372
+ printf(" all stages: %.2f cycles per input byte.\n",
373
+ (double)total / (iterations * p.size()));
374
+ printf("Estimated average frequency: %.3f GHz.\n",
375
+ (double)total / (iterations * min_result * 1000000000.0));
376
+ }
377
+ #endif
378
+ if (!just_data) {
379
+ std::cout << "Min: " << min_result << " bytes read: " << p.size()
380
+ << " Gigabytes/second: " << speedinGBs << std::endl;
381
+ }
382
+ if (json_output) {
383
+ isok = isok && pj.print_json(std::cout);
384
+ }
385
+ if (dump) {
386
+ isok = isok && pj.dump_raw_tape(std::cout);
387
+ }
388
+ if (!isok) {
389
+ fprintf(stderr, " Parsing failed. \n ");
390
+ return EXIT_FAILURE;
391
+ }
392
+ return EXIT_SUCCESS;
393
+ }
@@ -0,0 +1,305 @@
1
+ #include "simdjson/jsonparser.h"
2
+ #include <unistd.h>
3
+
4
+ #include "benchmark.h"
5
+ // #define RAPIDJSON_SSE2 // bad for performance
6
+ // #define RAPIDJSON_SSE42 // bad for performance
7
+ #include "rapidjson/document.h"
8
+ #include "rapidjson/reader.h"
9
+ #include "rapidjson/stringbuffer.h"
10
+ #include "rapidjson/writer.h"
11
+
12
+ #include "sajson.h"
13
+
14
+ using namespace rapidjson;
15
+ using namespace simdjson;
16
// Per-document tally of JSON value kinds, filled in by each parser's
// *_compute_stats function and compared with stat_equal.
//
// Every field has a default so that a result returned early (document not
// valid) still carries well-defined counters when it is compared or printed.
struct stat_s {
  size_t number_count = 0; // integers and floating-point numbers
  size_t object_count = 0;
  size_t array_count = 0;
  size_t null_count = 0;
  size_t true_count = 0;
  size_t false_count = 0;
  bool valid = false; // whether the parser accepted the document
};
25
+
26
+ typedef struct stat_s stat_t;
27
+
28
+ bool stat_equal(const stat_t &s1, const stat_t &s2) {
29
+ return (s1.valid == s2.valid) && (s1.number_count == s2.number_count) &&
30
+ (s1.object_count == s2.object_count) &&
31
+ (s1.array_count == s2.array_count) &&
32
+ (s1.null_count == s2.null_count) && (s1.true_count == s2.true_count) &&
33
+ (s1.false_count == s2.false_count);
34
+ }
35
+
36
+ void print_stat(const stat_t &s) {
37
+ if (!s.valid) {
38
+ printf("invalid\n");
39
+ return;
40
+ }
41
+ printf("number: %zu object: %zu array: %zu null: %zu true: %zu false: %zu\n",
42
+ s.number_count, s.object_count, s.array_count, s.null_count,
43
+ s.true_count, s.false_count);
44
+ }
45
+
46
+ __attribute__((noinline)) stat_t
47
+ simdjson_compute_stats(const simdjson::padded_string &p) {
48
+ stat_t answer;
49
+ simdjson::ParsedJson pj = build_parsed_json(p);
50
+ answer.valid = pj.is_valid();
51
+ if (!answer.valid) {
52
+ return answer;
53
+ }
54
+ answer.number_count = 0;
55
+ answer.object_count = 0;
56
+ answer.array_count = 0;
57
+ answer.null_count = 0;
58
+ answer.true_count = 0;
59
+ answer.false_count = 0;
60
+ size_t tape_idx = 0;
61
+ uint64_t tape_val = pj.tape[tape_idx++];
62
+ uint8_t type = (tape_val >> 56);
63
+ size_t how_many = 0;
64
+ assert(type == 'r');
65
+ how_many = tape_val & JSON_VALUE_MASK;
66
+ for (; tape_idx < how_many; tape_idx++) {
67
+ tape_val = pj.tape[tape_idx];
68
+ // uint64_t payload = tape_val & JSON_VALUE_MASK;
69
+ type = (tape_val >> 56);
70
+ switch (type) {
71
+ case 'l': // we have a long int
72
+ answer.number_count++;
73
+ tape_idx++; // skipping the integer
74
+ break;
75
+ case 'd': // we have a double
76
+ answer.number_count++;
77
+ tape_idx++; // skipping the double
78
+ break;
79
+ case 'n': // we have a null
80
+ answer.null_count++;
81
+ break;
82
+ case 't': // we have a true
83
+ answer.true_count++;
84
+ break;
85
+ case 'f': // we have a false
86
+ answer.false_count++;
87
+ break;
88
+ case '{': // we have an object
89
+ answer.object_count++;
90
+ break;
91
+ case '}': // we end an object
92
+ break;
93
+ case '[': // we start an array
94
+ answer.array_count++;
95
+ break;
96
+ case ']': // we end an array
97
+ break;
98
+ default:
99
+ break; // ignore
100
+ }
101
+ }
102
+ return answer;
103
+ }
104
+
105
+ // see
106
+ // https://github.com/miloyip/nativejson-benchmark/blob/master/src/tests/sajsontest.cpp
107
+ void sajson_traverse(stat_t &stats, const sajson::value &node) {
108
+ using namespace sajson;
109
+ switch (node.get_type()) {
110
+ case TYPE_NULL:
111
+ stats.null_count++;
112
+ break;
113
+ case TYPE_FALSE:
114
+ stats.false_count++;
115
+ break;
116
+ case TYPE_TRUE:
117
+ stats.true_count++;
118
+ break;
119
+ case TYPE_ARRAY: {
120
+ stats.array_count++;
121
+ auto length = node.get_length();
122
+ for (size_t i = 0; i < length; ++i) {
123
+ sajson_traverse(stats, node.get_array_element(i));
124
+ }
125
+ break;
126
+ }
127
+ case TYPE_OBJECT: {
128
+ stats.object_count++;
129
+ auto length = node.get_length();
130
+ for (auto i = 0u; i < length; ++i) {
131
+ sajson_traverse(stats, node.get_object_value(i));
132
+ }
133
+ break;
134
+ }
135
+ case TYPE_STRING:
136
+ // skip
137
+ break;
138
+
139
+ case TYPE_DOUBLE:
140
+ case TYPE_INTEGER:
141
+ stats.number_count++; // node.get_number_value();
142
+ break;
143
+ default:
144
+ assert(false && "unknown node type");
145
+ }
146
+ }
147
+
148
+ __attribute__((noinline)) stat_t
149
+ sasjon_compute_stats(const simdjson::padded_string &p) {
150
+ stat_t answer;
151
+ char *buffer = (char *)malloc(p.size());
152
+ memcpy(buffer, p.data(), p.size());
153
+ auto d = sajson::parse(sajson::dynamic_allocation(),
154
+ sajson::mutable_string_view(p.size(), buffer));
155
+ answer.valid = d.is_valid();
156
+ if (!answer.valid) {
157
+ return answer;
158
+ }
159
+ answer.number_count = 0;
160
+ answer.object_count = 0;
161
+ answer.array_count = 0;
162
+ answer.null_count = 0;
163
+ answer.true_count = 0;
164
+ answer.false_count = 0;
165
+ sajson_traverse(answer, d.get_root());
166
+ free(buffer);
167
+ return answer;
168
+ }
169
+
170
+ void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
171
+ switch (v.GetType()) {
172
+ case kNullType:
173
+ stats.null_count++;
174
+ break;
175
+ case kFalseType:
176
+ stats.false_count++;
177
+ break;
178
+ case kTrueType:
179
+ stats.true_count++;
180
+ break;
181
+
182
+ case kObjectType:
183
+ for (Value::ConstMemberIterator m = v.MemberBegin(); m != v.MemberEnd();
184
+ ++m) {
185
+ rapid_traverse(stats, m->value);
186
+ }
187
+ stats.object_count++;
188
+ break;
189
+ case kArrayType:
190
+ for (Value::ConstValueIterator i = v.Begin(); i != v.End();
191
+ ++i) { // v.Size();
192
+ rapid_traverse(stats, *i);
193
+ }
194
+ stats.array_count++;
195
+ break;
196
+
197
+ case kStringType:
198
+ break;
199
+
200
+ case kNumberType:
201
+ stats.number_count++;
202
+ break;
203
+ }
204
+ }
205
+
206
+ __attribute__((noinline)) stat_t
207
+ rapid_compute_stats(const simdjson::padded_string &p) {
208
+ stat_t answer;
209
+ char *buffer = (char *)malloc(p.size() + 1);
210
+ memcpy(buffer, p.data(), p.size());
211
+ buffer[p.size()] = '\0';
212
+ rapidjson::Document d;
213
+ d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
214
+ answer.valid = !d.HasParseError();
215
+ if (!answer.valid) {
216
+ return answer;
217
+ }
218
+ answer.number_count = 0;
219
+ answer.object_count = 0;
220
+ answer.array_count = 0;
221
+ answer.null_count = 0;
222
+ answer.true_count = 0;
223
+ answer.false_count = 0;
224
+ rapid_traverse(answer, d);
225
+ free(buffer);
226
+ return answer;
227
+ }
228
+
229
+ int main(int argc, char *argv[]) {
230
+ bool verbose = false;
231
+ bool just_data = false;
232
+
233
+ int c;
234
+ while ((c = getopt(argc, argv, "vt")) != -1)
235
+ switch (c) {
236
+ case 't':
237
+ just_data = true;
238
+ break;
239
+ case 'v':
240
+ verbose = true;
241
+ break;
242
+ default:
243
+ abort();
244
+ }
245
+ if (optind >= argc) {
246
+ std::cerr
247
+ << "Using different parsers, we compute the content statistics of "
248
+ "JSON documents."
249
+ << std::endl;
250
+ std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
251
+ std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
252
+ exit(1);
253
+ }
254
+ const char *filename = argv[optind];
255
+ if (optind + 1 < argc) {
256
+ std::cerr << "warning: ignoring everything after " << argv[optind + 1]
257
+ << std::endl;
258
+ }
259
+ simdjson::padded_string p;
260
+ try {
261
+ simdjson::get_corpus(filename).swap(p);
262
+ } catch (const std::exception &e) { // caught by reference to base
263
+ std::cout << "Could not load the file " << filename << std::endl;
264
+ return EXIT_FAILURE;
265
+ }
266
+
267
+ if (verbose) {
268
+ std::cout << "Input has ";
269
+ if (p.size() > 1024 * 1024)
270
+ std::cout << p.size() / (1024 * 1024) << " MB ";
271
+ else if (p.size() > 1024)
272
+ std::cout << p.size() / 1024 << " KB ";
273
+ else
274
+ std::cout << p.size() << " B ";
275
+ std::cout << std::endl;
276
+ }
277
+ stat_t s1 = simdjson_compute_stats(p);
278
+ if (verbose) {
279
+ printf("simdjson: ");
280
+ print_stat(s1);
281
+ }
282
+ stat_t s2 = rapid_compute_stats(p);
283
+ if (verbose) {
284
+ printf("rapid: ");
285
+ print_stat(s2);
286
+ }
287
+ stat_t s3 = sasjon_compute_stats(p);
288
+ if (verbose) {
289
+ printf("sasjon: ");
290
+ print_stat(s3);
291
+ }
292
+ assert(stat_equal(s1, s2));
293
+ assert(stat_equal(s1, s3));
294
+ int repeat = 50;
295
+ int volume = p.size();
296
+ if (just_data) {
297
+ printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
298
+ }
299
+ BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat,
300
+ volume, !just_data);
301
+ BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume,
302
+ !just_data);
303
+ BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume,
304
+ !just_data);
305
+ }