simdjson 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.clang-format +5 -0
- data/.gitignore +14 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +9 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +32 -0
- data/benchmark/apache_builds.json +4421 -0
- data/benchmark/demo.json +15 -0
- data/benchmark/github_events.json +1390 -0
- data/benchmark/run_benchmark.rb +30 -0
- data/ext/simdjson/extconf.rb +22 -0
- data/ext/simdjson/simdjson.cpp +76 -0
- data/ext/simdjson/simdjson.hpp +6 -0
- data/lib/simdjson/version.rb +3 -0
- data/lib/simdjson.rb +2 -0
- data/simdjson.gemspec +35 -0
- data/vendor/.gitkeep +0 -0
- data/vendor/simdjson/AUTHORS +3 -0
- data/vendor/simdjson/CMakeLists.txt +63 -0
- data/vendor/simdjson/CONTRIBUTORS +27 -0
- data/vendor/simdjson/Dockerfile +10 -0
- data/vendor/simdjson/LICENSE +201 -0
- data/vendor/simdjson/Makefile +203 -0
- data/vendor/simdjson/Notes.md +85 -0
- data/vendor/simdjson/README.md +581 -0
- data/vendor/simdjson/amalgamation.sh +158 -0
- data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
- data/vendor/simdjson/benchmark/benchmark.h +223 -0
- data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
- data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
- data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
- data/vendor/simdjson/benchmark/parse.cpp +393 -0
- data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
- data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
- data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
- data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
- data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
- data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
- data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/gbps.png +0 -0
- data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
- data/vendor/simdjson/images/halvarflake.png +0 -0
- data/vendor/simdjson/images/logo.png +0 -0
- data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
- data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
- data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
- data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
- data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
- data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
- data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
- data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
- data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
- data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
- data/vendor/simdjson/include/simdjson/portability.h +172 -0
- data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
- data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
- data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
- data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
- data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
- data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
- data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
- data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
- data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
- data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
- data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
- data/vendor/simdjson/scripts/issue150.sh +14 -0
- data/vendor/simdjson/scripts/javascript/README.md +3 -0
- data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
- data/vendor/simdjson/scripts/minifier.sh +11 -0
- data/vendor/simdjson/scripts/parseandstat.sh +24 -0
- data/vendor/simdjson/scripts/parser.sh +11 -0
- data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
- data/vendor/simdjson/scripts/plotparse.sh +98 -0
- data/vendor/simdjson/scripts/selectparser.sh +11 -0
- data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
- data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
- data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
- data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
- data/vendor/simdjson/scripts/testjson2json.sh +99 -0
- data/vendor/simdjson/scripts/transitions/Makefile +10 -0
- data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
- data/vendor/simdjson/singleheader/README.md +1 -0
- data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
- data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
- data/vendor/simdjson/singleheader/simdjson.h +39692 -0
- data/vendor/simdjson/src/CMakeLists.txt +67 -0
- data/vendor/simdjson/src/jsonioutil.cpp +35 -0
- data/vendor/simdjson/src/jsonminifier.cpp +285 -0
- data/vendor/simdjson/src/jsonparser.cpp +91 -0
- data/vendor/simdjson/src/parsedjson.cpp +323 -0
- data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
- data/vendor/simdjson/src/simdjson.cpp +30 -0
- data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
- data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
- data/vendor/simdjson/style/clang-format-check.sh +25 -0
- data/vendor/simdjson/style/clang-format.sh +25 -0
- data/vendor/simdjson/style/run-clang-format.py +326 -0
- data/vendor/simdjson/tape.md +134 -0
- data/vendor/simdjson/tests/CMakeLists.txt +25 -0
- data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
- data/vendor/simdjson/tests/basictests.cpp +75 -0
- data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
- data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
- data/vendor/simdjson/tests/pointercheck.cpp +38 -0
- data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
- data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
- data/vendor/simdjson/tools/CMakeLists.txt +3 -0
- data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
- data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
- data/vendor/simdjson/tools/json2json.cpp +112 -0
- data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
- data/vendor/simdjson/tools/jsonstats.cpp +143 -0
- data/vendor/simdjson/tools/minify.cpp +21 -0
- data/vendor/simdjson/tools/release.py +125 -0
- data/vendor/simdjson/windows/dirent_portable.h +1043 -0
- metadata +273 -0
@@ -0,0 +1,393 @@
|
|
1
|
+
#include <cassert>
|
2
|
+
#include <cctype>
|
3
|
+
#ifndef _MSC_VER
|
4
|
+
#include <dirent.h>
|
5
|
+
#include <unistd.h>
|
6
|
+
#endif
|
7
|
+
#include <cinttypes>
|
8
|
+
|
9
|
+
#include <cstdio>
|
10
|
+
#include <cstdlib>
|
11
|
+
#include <cstring>
|
12
|
+
|
13
|
+
#include <algorithm>
|
14
|
+
#include <chrono>
|
15
|
+
#include <cstring>
|
16
|
+
#include <fstream>
|
17
|
+
#include <iomanip>
|
18
|
+
#include <iostream>
|
19
|
+
#include <map>
|
20
|
+
#include <set>
|
21
|
+
#include <sstream>
|
22
|
+
#include <string>
|
23
|
+
#include <vector>
|
24
|
+
|
25
|
+
#include "linux-perf-events.h"
|
26
|
+
#ifdef __linux__
|
27
|
+
#include <libgen.h>
|
28
|
+
#endif
|
29
|
+
//#define DEBUG
|
30
|
+
#include "simdjson/common_defs.h"
|
31
|
+
#include "simdjson/isadetection.h"
|
32
|
+
#include "simdjson/jsonioutil.h"
|
33
|
+
#include "simdjson/jsonparser.h"
|
34
|
+
#include "simdjson/parsedjson.h"
|
35
|
+
#include "simdjson/stage1_find_marks.h"
|
36
|
+
#include "simdjson/stage2_build_tape.h"
|
37
|
+
namespace simdjson {
|
38
|
+
// Picks the implementation to use on the processor we are running on.
//
// The bitmask returned by detect_supported_architectures() is matched against
// the instruction sets each implementation needs, from best to worst.
// Returns Architecture::NONE when none of the implementations is usable.
Architecture _find_best_supported_implementation() {
  constexpr uint32_t haswell_flags =
      instruction_set::AVX2 | instruction_set::PCLMULQDQ |
      instruction_set::BMI1 | instruction_set::BMI2;
  constexpr uint32_t westmere_flags =
      instruction_set::SSE42 | instruction_set::PCLMULQDQ;
  const uint32_t supports = detect_supported_architectures();
  // Order from best to worst (within architecture)
  if ((haswell_flags & supports) == haswell_flags) {
    return Architecture::HASWELL;
  }
  if ((westmere_flags & supports) == westmere_flags) {
    return Architecture::WESTMERE;
  }
  // The NEON bit has to be looked up in the detected mask. Testing the
  // constant on its own ignores `supports` entirely, so the result would not
  // depend on what the processor actually offers.
  if ((supports & instruction_set::NEON) != 0) {
    return Architecture::ARM64;
  }
  return Architecture::NONE;
}
|
57
|
+
|
58
|
+
// Function type of the unified_machine implementations and of their
// dispatcher: input buffer, its length, the ParsedJson to work on; the int
// result is compared against simdjson::SUCCESS by the callers.
using unified_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
59
|
+
// Function type of the find_structural_bits implementations and of their
// dispatcher: input buffer, its length, the ParsedJson to work on; the int
// result is compared against simdjson::SUCCESS by the callers.
using stage1_functype = int(const uint8_t *buf, size_t len, ParsedJson &pj);
|
60
|
+
|
61
|
+
// Declared ahead of its definition so that unified_machine_dispatch can
// overwrite it with the selected implementation.
extern unified_functype *unified_ptr;
|
62
|
+
|
63
|
+
// Declared ahead of its definition so that find_structural_bits_dispatch can
// overwrite it with the selected implementation.
extern stage1_functype *stage1_ptr;
|
64
|
+
|
65
|
+
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
|
66
|
+
Architecture best_implementation = _find_best_supported_implementation();
|
67
|
+
// Selecting the best implementation
|
68
|
+
switch (best_implementation) {
|
69
|
+
#ifdef IS_X86_64
|
70
|
+
case Architecture::HASWELL:
|
71
|
+
unified_ptr = &unified_machine<Architecture::HASWELL>;
|
72
|
+
break;
|
73
|
+
case Architecture::WESTMERE:
|
74
|
+
unified_ptr = &unified_machine<Architecture::WESTMERE>;
|
75
|
+
break;
|
76
|
+
#endif
|
77
|
+
#ifdef IS_ARM64
|
78
|
+
case Architecture::ARM64:
|
79
|
+
unified_ptr = &unified_machine<Architecture::ARM64>;
|
80
|
+
break;
|
81
|
+
#endif
|
82
|
+
default:
|
83
|
+
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
84
|
+
return simdjson::UNEXPECTED_ERROR;
|
85
|
+
}
|
86
|
+
|
87
|
+
return unified_ptr(buf, len, pj);
|
88
|
+
}
|
89
|
+
|
90
|
+
// Responsible to select the best json_parse implementation
|
91
|
+
int find_structural_bits_dispatch(const uint8_t *buf, size_t len,
|
92
|
+
ParsedJson &pj) {
|
93
|
+
Architecture best_implementation = _find_best_supported_implementation();
|
94
|
+
// Selecting the best implementation
|
95
|
+
switch (best_implementation) {
|
96
|
+
#ifdef IS_X86_64
|
97
|
+
case Architecture::HASWELL:
|
98
|
+
stage1_ptr = &find_structural_bits<Architecture::HASWELL>;
|
99
|
+
break;
|
100
|
+
case Architecture::WESTMERE:
|
101
|
+
stage1_ptr = &find_structural_bits<Architecture::WESTMERE>;
|
102
|
+
break;
|
103
|
+
#endif
|
104
|
+
#ifdef IS_ARM64
|
105
|
+
case Architecture::ARM64:
|
106
|
+
stage1_ptr = &find_structural_bits<Architecture::ARM64>;
|
107
|
+
break;
|
108
|
+
#endif
|
109
|
+
default:
|
110
|
+
std::cerr << "The processor is not supported by simdjson." << std::endl;
|
111
|
+
return simdjson::UNEXPECTED_ERROR;
|
112
|
+
}
|
113
|
+
|
114
|
+
return stage1_ptr(buf, len, pj);
|
115
|
+
}
|
116
|
+
|
117
|
+
// Starts out pointing at the dispatcher, so the first call made through this
// pointer selects an implementation and replaces the pointer with it.
stage1_functype *stage1_ptr = &find_structural_bits_dispatch;
|
118
|
+
// Starts out pointing at the dispatcher, so the first call made through this
// pointer selects an implementation and replaces the pointer with it.
unified_functype *unified_ptr = &unified_machine_dispatch;
|
119
|
+
} // namespace simdjson
|
120
|
+
|
121
|
+
int main(int argc, char *argv[]) {
|
122
|
+
bool verbose = false;
|
123
|
+
bool dump = false;
|
124
|
+
bool json_output = false;
|
125
|
+
bool force_one_iteration = false;
|
126
|
+
bool just_data = false;
|
127
|
+
#ifndef _MSC_VER
|
128
|
+
int c;
|
129
|
+
|
130
|
+
while ((c = getopt(argc, argv, "1vdt")) != -1) {
|
131
|
+
switch (c) {
|
132
|
+
case 't':
|
133
|
+
just_data = true;
|
134
|
+
break;
|
135
|
+
case 'v':
|
136
|
+
verbose = true;
|
137
|
+
break;
|
138
|
+
case 'd':
|
139
|
+
dump = true;
|
140
|
+
break;
|
141
|
+
case 'j':
|
142
|
+
json_output = true;
|
143
|
+
break;
|
144
|
+
case '1':
|
145
|
+
force_one_iteration = true;
|
146
|
+
break;
|
147
|
+
default:
|
148
|
+
abort();
|
149
|
+
}
|
150
|
+
}
|
151
|
+
#else
|
152
|
+
int optind = 1;
|
153
|
+
#endif
|
154
|
+
if (optind >= argc) {
|
155
|
+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
156
|
+
exit(1);
|
157
|
+
}
|
158
|
+
const char *filename = argv[optind];
|
159
|
+
if (optind + 1 < argc) {
|
160
|
+
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
161
|
+
<< std::endl;
|
162
|
+
}
|
163
|
+
if (verbose) {
|
164
|
+
std::cout << "[verbose] loading " << filename << std::endl;
|
165
|
+
}
|
166
|
+
simdjson::padded_string p;
|
167
|
+
try {
|
168
|
+
simdjson::get_corpus(filename).swap(p);
|
169
|
+
} catch (const std::exception &) { // caught by reference to base
|
170
|
+
std::cout << "Could not load the file " << filename << std::endl;
|
171
|
+
return EXIT_FAILURE;
|
172
|
+
}
|
173
|
+
if (verbose) {
|
174
|
+
std::cout << "[verbose] loaded " << filename << " (" << p.size()
|
175
|
+
<< " bytes)" << std::endl;
|
176
|
+
}
|
177
|
+
#if defined(DEBUG)
|
178
|
+
const uint32_t iterations = 1;
|
179
|
+
#else
|
180
|
+
const uint32_t iterations =
|
181
|
+
force_one_iteration ? 1 : (p.size() < 1 * 1000 * 1000 ? 1000 : 10);
|
182
|
+
#endif
|
183
|
+
std::vector<double> res;
|
184
|
+
res.resize(iterations);
|
185
|
+
if (!just_data)
|
186
|
+
printf("number of iterations %u \n", iterations);
|
187
|
+
#if !defined(__linux__)
|
188
|
+
#define SQUASH_COUNTERS
|
189
|
+
if (just_data) {
|
190
|
+
printf("just_data (-t) flag only works under linux.\n");
|
191
|
+
}
|
192
|
+
#endif
|
193
|
+
{ // practice run
|
194
|
+
simdjson::ParsedJson pj;
|
195
|
+
bool allocok = pj.allocate_capacity(p.size());
|
196
|
+
if (allocok) {
|
197
|
+
simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj);
|
198
|
+
simdjson::unified_ptr(
|
199
|
+
(const uint8_t
|
200
|
+
*)(const uint8_t
|
201
|
+
*)(const uint8_t
|
202
|
+
*)(const uint8_t
|
203
|
+
*)(const uint8_t
|
204
|
+
*)(const uint8_t
|
205
|
+
*)(const uint8_t
|
206
|
+
*)(const uint8_t *)
|
207
|
+
p.data(),
|
208
|
+
p.size(), pj);
|
209
|
+
}
|
210
|
+
}
|
211
|
+
#ifndef SQUASH_COUNTERS
|
212
|
+
std::vector<int> evts;
|
213
|
+
evts.push_back(PERF_COUNT_HW_CPU_CYCLES);
|
214
|
+
evts.push_back(PERF_COUNT_HW_INSTRUCTIONS);
|
215
|
+
evts.push_back(PERF_COUNT_HW_BRANCH_MISSES);
|
216
|
+
evts.push_back(PERF_COUNT_HW_CACHE_REFERENCES);
|
217
|
+
evts.push_back(PERF_COUNT_HW_CACHE_MISSES);
|
218
|
+
LinuxEvents<PERF_TYPE_HARDWARE> unified(evts);
|
219
|
+
std::vector<unsigned long long> results;
|
220
|
+
results.resize(evts.size());
|
221
|
+
unsigned long cy0 = 0, cy1 = 0, cy2 = 0;
|
222
|
+
unsigned long cl0 = 0, cl1 = 0, cl2 = 0;
|
223
|
+
unsigned long mis0 = 0, mis1 = 0, mis2 = 0;
|
224
|
+
unsigned long cref0 = 0, cref1 = 0, cref2 = 0;
|
225
|
+
unsigned long cmis0 = 0, cmis1 = 0, cmis2 = 0;
|
226
|
+
#endif
|
227
|
+
bool isok = true;
|
228
|
+
#ifndef SQUASH_COUNTERS
|
229
|
+
for (uint32_t i = 0; i < iterations; i++) {
|
230
|
+
if (verbose) {
|
231
|
+
std::cout << "[verbose] iteration # " << i << std::endl;
|
232
|
+
}
|
233
|
+
unified.start();
|
234
|
+
simdjson::ParsedJson pj;
|
235
|
+
bool allocok = pj.allocate_capacity(p.size());
|
236
|
+
if (!allocok) {
|
237
|
+
std::cerr << "failed to allocate memory" << std::endl;
|
238
|
+
return EXIT_FAILURE;
|
239
|
+
}
|
240
|
+
unified.end(results);
|
241
|
+
cy0 += results[0];
|
242
|
+
cl0 += results[1];
|
243
|
+
mis0 += results[2];
|
244
|
+
cref0 += results[3];
|
245
|
+
cmis0 += results[4];
|
246
|
+
if (verbose) {
|
247
|
+
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
|
248
|
+
}
|
249
|
+
unified.start();
|
250
|
+
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
251
|
+
simdjson::SUCCESS);
|
252
|
+
unified.end(results);
|
253
|
+
cy1 += results[0];
|
254
|
+
cl1 += results[1];
|
255
|
+
mis1 += results[2];
|
256
|
+
cref1 += results[3];
|
257
|
+
cmis1 += results[4];
|
258
|
+
if (!isok) {
|
259
|
+
std::cout << "Failed during stage 1" << std::endl;
|
260
|
+
break;
|
261
|
+
}
|
262
|
+
unified.start();
|
263
|
+
isok = isok &&
|
264
|
+
(simdjson::SUCCESS ==
|
265
|
+
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
266
|
+
unified.end(results);
|
267
|
+
cy2 += results[0];
|
268
|
+
cl2 += results[1];
|
269
|
+
mis2 += results[2];
|
270
|
+
cref2 += results[3];
|
271
|
+
cmis2 += results[4];
|
272
|
+
if (!isok) {
|
273
|
+
std::cout << "Failed during stage 2" << std::endl;
|
274
|
+
break;
|
275
|
+
}
|
276
|
+
}
|
277
|
+
#endif
|
278
|
+
// we do it again, this time just measuring the elapsed time
|
279
|
+
for (uint32_t i = 0; i < iterations; i++) {
|
280
|
+
if (verbose) {
|
281
|
+
std::cout << "[verbose] iteration # " << i << std::endl;
|
282
|
+
}
|
283
|
+
simdjson::ParsedJson pj;
|
284
|
+
bool allocok = pj.allocate_capacity(p.size());
|
285
|
+
if (!allocok) {
|
286
|
+
std::cerr << "failed to allocate memory" << std::endl;
|
287
|
+
return EXIT_FAILURE;
|
288
|
+
}
|
289
|
+
if (verbose) {
|
290
|
+
std::cout << "[verbose] allocated memory for parsed JSON " << std::endl;
|
291
|
+
}
|
292
|
+
|
293
|
+
auto start = std::chrono::steady_clock::now();
|
294
|
+
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
|
295
|
+
simdjson::SUCCESS);
|
296
|
+
isok = isok &&
|
297
|
+
(simdjson::SUCCESS ==
|
298
|
+
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
|
299
|
+
auto end = std::chrono::steady_clock::now();
|
300
|
+
std::chrono::duration<double> secs = end - start;
|
301
|
+
res[i] = secs.count();
|
302
|
+
if (!isok) {
|
303
|
+
std::cerr << pj.get_error_message() << std::endl;
|
304
|
+
std::cerr << "Could not parse. " << std::endl;
|
305
|
+
return EXIT_FAILURE;
|
306
|
+
}
|
307
|
+
}
|
308
|
+
simdjson::ParsedJson pj =
|
309
|
+
build_parsed_json(p); // do the parsing again to get the stats
|
310
|
+
if (!pj.is_valid()) {
|
311
|
+
std::cerr << pj.get_error_message() << std::endl;
|
312
|
+
std::cerr << "Could not parse. " << std::endl;
|
313
|
+
return EXIT_FAILURE;
|
314
|
+
}
|
315
|
+
double min_result = *min_element(res.begin(), res.end());
|
316
|
+
double speedinGBs = (p.size()) / (min_result * 1000000000.0);
|
317
|
+
#ifndef SQUASH_COUNTERS
|
318
|
+
unsigned long total = cy0 + cy1 + cy2;
|
319
|
+
if (just_data) {
|
320
|
+
float cpb0 = (double)cy0 / (iterations * p.size());
|
321
|
+
float cpb1 = (double)cy1 / (iterations * p.size());
|
322
|
+
float cpb2 = (double)cy2 / (iterations * p.size());
|
323
|
+
float cpbtotal = (double)total / (iterations * p.size());
|
324
|
+
char *newfile = (char *)malloc(strlen(filename) + 1);
|
325
|
+
if (newfile == NULL) {
|
326
|
+
return EXIT_FAILURE;
|
327
|
+
}
|
328
|
+
::strcpy(newfile, filename);
|
329
|
+
char *snewfile = ::basename(newfile);
|
330
|
+
size_t nl = strlen(snewfile);
|
331
|
+
for (size_t j = nl - 1; j > 0; j--) {
|
332
|
+
if (snewfile[j] == '.') {
|
333
|
+
snewfile[j] = '\0';
|
334
|
+
break;
|
335
|
+
}
|
336
|
+
}
|
337
|
+
printf("\"%s\"\t%f\t%f\t%f\t%f\t%f\n", snewfile, cpb0, cpb1, cpb2, cpbtotal,
|
338
|
+
speedinGBs);
|
339
|
+
free(newfile);
|
340
|
+
} else {
|
341
|
+
printf("number of bytes %ld number of structural chars %u ratio %.3f\n",
|
342
|
+
p.size(), pj.n_structural_indexes,
|
343
|
+
(double)pj.n_structural_indexes / p.size());
|
344
|
+
printf("mem alloc instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
345
|
+
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
|
346
|
+
"%10lu (failure %10lu)\n",
|
347
|
+
cl0 / iterations, cy0 / iterations, 100. * cy0 / total,
|
348
|
+
(double)cl0 / cy0, mis0 / iterations, (double)cy0 / mis0,
|
349
|
+
cref1 / iterations, cmis0 / iterations);
|
350
|
+
printf(" mem alloc runs at %.2f cycles per input byte.\n",
|
351
|
+
(double)cy0 / (iterations * p.size()));
|
352
|
+
printf("stage 1 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
353
|
+
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache accesses: "
|
354
|
+
"%10lu (failure %10lu)\n",
|
355
|
+
cl1 / iterations, cy1 / iterations, 100. * cy1 / total,
|
356
|
+
(double)cl1 / cy1, mis1 / iterations, (double)cy1 / mis1,
|
357
|
+
cref1 / iterations, cmis1 / iterations);
|
358
|
+
printf(" stage 1 runs at %.2f cycles per input byte.\n",
|
359
|
+
(double)cy1 / (iterations * p.size()));
|
360
|
+
|
361
|
+
printf("stage 2 instructions: %10lu cycles: %10lu (%.2f %%) ins/cycles: "
|
362
|
+
"%.2f mis. branches: %10lu (cycles/mis.branch %.2f) cache "
|
363
|
+
"accesses: %10lu (failure %10lu)\n",
|
364
|
+
cl2 / iterations, cy2 / iterations, 100. * cy2 / total,
|
365
|
+
(double)cl2 / cy2, mis2 / iterations, (double)cy2 / mis2,
|
366
|
+
cref2 / iterations, cmis2 / iterations);
|
367
|
+
printf(" stage 2 runs at %.2f cycles per input byte and ",
|
368
|
+
(double)cy2 / (iterations * p.size()));
|
369
|
+
printf("%.2f cycles per structural character.\n",
|
370
|
+
(double)cy2 / (iterations * pj.n_structural_indexes));
|
371
|
+
|
372
|
+
printf(" all stages: %.2f cycles per input byte.\n",
|
373
|
+
(double)total / (iterations * p.size()));
|
374
|
+
printf("Estimated average frequency: %.3f GHz.\n",
|
375
|
+
(double)total / (iterations * min_result * 1000000000.0));
|
376
|
+
}
|
377
|
+
#endif
|
378
|
+
if (!just_data) {
|
379
|
+
std::cout << "Min: " << min_result << " bytes read: " << p.size()
|
380
|
+
<< " Gigabytes/second: " << speedinGBs << std::endl;
|
381
|
+
}
|
382
|
+
if (json_output) {
|
383
|
+
isok = isok && pj.print_json(std::cout);
|
384
|
+
}
|
385
|
+
if (dump) {
|
386
|
+
isok = isok && pj.dump_raw_tape(std::cout);
|
387
|
+
}
|
388
|
+
if (!isok) {
|
389
|
+
fprintf(stderr, " Parsing failed. \n ");
|
390
|
+
return EXIT_FAILURE;
|
391
|
+
}
|
392
|
+
return EXIT_SUCCESS;
|
393
|
+
}
|
@@ -0,0 +1,305 @@
|
|
1
|
+
#include "simdjson/jsonparser.h"
|
2
|
+
#include <unistd.h>
|
3
|
+
|
4
|
+
#include "benchmark.h"
|
5
|
+
// #define RAPIDJSON_SSE2 // bad for performance
|
6
|
+
// #define RAPIDJSON_SSE42 // bad for performance
|
7
|
+
#include "rapidjson/document.h"
|
8
|
+
#include "rapidjson/reader.h"
|
9
|
+
#include "rapidjson/stringbuffer.h"
|
10
|
+
#include "rapidjson/writer.h"
|
11
|
+
|
12
|
+
#include "sajson.h"
|
13
|
+
|
14
|
+
using namespace rapidjson;
|
15
|
+
using namespace simdjson;
|
16
|
+
// Content statistics gathered from one parsed JSON document.
//
// Every member has an initial value so that a result returned early (parse
// failure) never carries indeterminate counters into stat_equal().
struct stat_s {
  size_t number_count{0}; // numbers, integer and floating point alike
  size_t object_count{0}; // objects
  size_t array_count{0};  // arrays
  size_t null_count{0};   // null values
  size_t true_count{0};   // true values
  size_t false_count{0};  // false values
  bool valid{false};      // whether the parser accepted the document
};

typedef struct stat_s stat_t;
|
27
|
+
|
28
|
+
bool stat_equal(const stat_t &s1, const stat_t &s2) {
|
29
|
+
return (s1.valid == s2.valid) && (s1.number_count == s2.number_count) &&
|
30
|
+
(s1.object_count == s2.object_count) &&
|
31
|
+
(s1.array_count == s2.array_count) &&
|
32
|
+
(s1.null_count == s2.null_count) && (s1.true_count == s2.true_count) &&
|
33
|
+
(s1.false_count == s2.false_count);
|
34
|
+
}
|
35
|
+
|
36
|
+
void print_stat(const stat_t &s) {
|
37
|
+
if (!s.valid) {
|
38
|
+
printf("invalid\n");
|
39
|
+
return;
|
40
|
+
}
|
41
|
+
printf("number: %zu object: %zu array: %zu null: %zu true: %zu false: %zu\n",
|
42
|
+
s.number_count, s.object_count, s.array_count, s.null_count,
|
43
|
+
s.true_count, s.false_count);
|
44
|
+
}
|
45
|
+
|
46
|
+
// Parses `p` with simdjson and counts its values by walking the tape.
//
// Returns a result with valid == false and all counters at zero when the
// document cannot be parsed.
__attribute__((noinline)) stat_t
simdjson_compute_stats(const simdjson::padded_string &p) {
  // Value-initialized so that the early return below hands back zeroed
  // counters instead of indeterminate ones.
  stat_t answer{};
  simdjson::ParsedJson pj = build_parsed_json(p);
  answer.valid = pj.is_valid();
  if (!answer.valid) {
    return answer;
  }
  size_t tape_idx = 0;
  uint64_t tape_val = pj.tape[tape_idx++];
  // The top 8 bits of a tape word hold its type tag.
  uint8_t type = (tape_val >> 56);
  assert(type == 'r');
  // The payload of the first word bounds the walk over the tape.
  const size_t how_many = tape_val & JSON_VALUE_MASK;
  for (; tape_idx < how_many; tape_idx++) {
    tape_val = pj.tape[tape_idx];
    type = (tape_val >> 56);
    switch (type) {
    case 'l': // we have a long int
      answer.number_count++;
      tape_idx++; // skipping the integer
      break;
    case 'd': // we have a double
      answer.number_count++;
      tape_idx++; // skipping the double
      break;
    case 'n': // we have a null
      answer.null_count++;
      break;
    case 't': // we have a true
      answer.true_count++;
      break;
    case 'f': // we have a false
      answer.false_count++;
      break;
    case '{': // we have an object
      answer.object_count++;
      break;
    case '[': // we start an array
      answer.array_count++;
      break;
    case '}': // we end an object
    case ']': // we end an array
    default:
      break; // ignore
    }
  }
  return answer;
}
|
104
|
+
|
105
|
+
// see
|
106
|
+
// https://github.com/miloyip/nativejson-benchmark/blob/master/src/tests/sajsontest.cpp
|
107
|
+
void sajson_traverse(stat_t &stats, const sajson::value &node) {
|
108
|
+
using namespace sajson;
|
109
|
+
switch (node.get_type()) {
|
110
|
+
case TYPE_NULL:
|
111
|
+
stats.null_count++;
|
112
|
+
break;
|
113
|
+
case TYPE_FALSE:
|
114
|
+
stats.false_count++;
|
115
|
+
break;
|
116
|
+
case TYPE_TRUE:
|
117
|
+
stats.true_count++;
|
118
|
+
break;
|
119
|
+
case TYPE_ARRAY: {
|
120
|
+
stats.array_count++;
|
121
|
+
auto length = node.get_length();
|
122
|
+
for (size_t i = 0; i < length; ++i) {
|
123
|
+
sajson_traverse(stats, node.get_array_element(i));
|
124
|
+
}
|
125
|
+
break;
|
126
|
+
}
|
127
|
+
case TYPE_OBJECT: {
|
128
|
+
stats.object_count++;
|
129
|
+
auto length = node.get_length();
|
130
|
+
for (auto i = 0u; i < length; ++i) {
|
131
|
+
sajson_traverse(stats, node.get_object_value(i));
|
132
|
+
}
|
133
|
+
break;
|
134
|
+
}
|
135
|
+
case TYPE_STRING:
|
136
|
+
// skip
|
137
|
+
break;
|
138
|
+
|
139
|
+
case TYPE_DOUBLE:
|
140
|
+
case TYPE_INTEGER:
|
141
|
+
stats.number_count++; // node.get_number_value();
|
142
|
+
break;
|
143
|
+
default:
|
144
|
+
assert(false && "unknown node type");
|
145
|
+
}
|
146
|
+
}
|
147
|
+
|
148
|
+
// Parses a private copy of `p` with sajson and counts its values.
//
// The copy is needed because sajson is handed a mutable view of the input.
// Returns a result with valid == false and all counters at zero when the
// copy cannot be allocated or the document cannot be parsed.
__attribute__((noinline)) stat_t
sasjon_compute_stats(const simdjson::padded_string &p) {
  stat_t answer{}; // zeroed counters, valid == false
  char *buffer = static_cast<char *>(malloc(p.size()));
  if (buffer == nullptr) {
    // Report the document as invalid rather than copying through a null
    // pointer.
    return answer;
  }
  memcpy(buffer, p.data(), p.size());
  {
    // Inner scope: the document is gone before the buffer it views is freed.
    auto d = sajson::parse(sajson::dynamic_allocation(),
                           sajson::mutable_string_view(p.size(), buffer));
    answer.valid = d.is_valid();
    if (answer.valid) {
      sajson_traverse(answer, d.get_root());
    }
  }
  // Single exit: the buffer is released on the failure path as well.
  free(buffer);
  return answer;
}
|
169
|
+
|
170
|
+
// Recursively visits `v` and everything below it, adding each value to the
// matching counter in `stats`. Strings are not counted.
void rapid_traverse(stat_t &stats, const rapidjson::Value &v) {
  switch (v.GetType()) {
  case kNullType:
    stats.null_count++;
    break;
  case kFalseType:
    stats.false_count++;
    break;
  case kTrueType:
    stats.true_count++;
    break;
  case kNumberType:
    stats.number_count++;
    break;
  case kStringType:
    break;
  case kArrayType: {
    stats.array_count++;
    const auto last = v.End();
    for (auto element = v.Begin(); element != last; ++element) {
      rapid_traverse(stats, *element);
    }
    break;
  }
  case kObjectType: {
    stats.object_count++;
    const auto last = v.MemberEnd();
    for (auto member = v.MemberBegin(); member != last; ++member) {
      rapid_traverse(stats, member->value);
    }
    break;
  }
  }
}
|
205
|
+
|
206
|
+
// Parses a private, NUL-terminated copy of `p` with RapidJSON (in situ, with
// encoding validation) and counts its values.
//
// Returns a result with valid == false and all counters at zero when the
// copy cannot be allocated or the document cannot be parsed.
__attribute__((noinline)) stat_t
rapid_compute_stats(const simdjson::padded_string &p) {
  stat_t answer{}; // zeroed counters, valid == false
  char *buffer = static_cast<char *>(malloc(p.size() + 1));
  if (buffer == nullptr) {
    // Report the document as invalid rather than copying through a null
    // pointer.
    return answer;
  }
  memcpy(buffer, p.data(), p.size());
  buffer[p.size()] = '\0';
  {
    // Inner scope: the document is gone before the buffer it parsed in
    // place is freed.
    rapidjson::Document d;
    d.ParseInsitu<kParseValidateEncodingFlag>(buffer);
    answer.valid = !d.HasParseError();
    if (answer.valid) {
      rapid_traverse(answer, d);
    }
  }
  // Single exit: the buffer is released on the failure path as well.
  free(buffer);
  return answer;
}
|
228
|
+
|
229
|
+
int main(int argc, char *argv[]) {
|
230
|
+
bool verbose = false;
|
231
|
+
bool just_data = false;
|
232
|
+
|
233
|
+
int c;
|
234
|
+
while ((c = getopt(argc, argv, "vt")) != -1)
|
235
|
+
switch (c) {
|
236
|
+
case 't':
|
237
|
+
just_data = true;
|
238
|
+
break;
|
239
|
+
case 'v':
|
240
|
+
verbose = true;
|
241
|
+
break;
|
242
|
+
default:
|
243
|
+
abort();
|
244
|
+
}
|
245
|
+
if (optind >= argc) {
|
246
|
+
std::cerr
|
247
|
+
<< "Using different parsers, we compute the content statistics of "
|
248
|
+
"JSON documents."
|
249
|
+
<< std::endl;
|
250
|
+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
251
|
+
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
|
252
|
+
exit(1);
|
253
|
+
}
|
254
|
+
const char *filename = argv[optind];
|
255
|
+
if (optind + 1 < argc) {
|
256
|
+
std::cerr << "warning: ignoring everything after " << argv[optind + 1]
|
257
|
+
<< std::endl;
|
258
|
+
}
|
259
|
+
simdjson::padded_string p;
|
260
|
+
try {
|
261
|
+
simdjson::get_corpus(filename).swap(p);
|
262
|
+
} catch (const std::exception &e) { // caught by reference to base
|
263
|
+
std::cout << "Could not load the file " << filename << std::endl;
|
264
|
+
return EXIT_FAILURE;
|
265
|
+
}
|
266
|
+
|
267
|
+
if (verbose) {
|
268
|
+
std::cout << "Input has ";
|
269
|
+
if (p.size() > 1024 * 1024)
|
270
|
+
std::cout << p.size() / (1024 * 1024) << " MB ";
|
271
|
+
else if (p.size() > 1024)
|
272
|
+
std::cout << p.size() / 1024 << " KB ";
|
273
|
+
else
|
274
|
+
std::cout << p.size() << " B ";
|
275
|
+
std::cout << std::endl;
|
276
|
+
}
|
277
|
+
stat_t s1 = simdjson_compute_stats(p);
|
278
|
+
if (verbose) {
|
279
|
+
printf("simdjson: ");
|
280
|
+
print_stat(s1);
|
281
|
+
}
|
282
|
+
stat_t s2 = rapid_compute_stats(p);
|
283
|
+
if (verbose) {
|
284
|
+
printf("rapid: ");
|
285
|
+
print_stat(s2);
|
286
|
+
}
|
287
|
+
stat_t s3 = sasjon_compute_stats(p);
|
288
|
+
if (verbose) {
|
289
|
+
printf("sasjon: ");
|
290
|
+
print_stat(s3);
|
291
|
+
}
|
292
|
+
assert(stat_equal(s1, s2));
|
293
|
+
assert(stat_equal(s1, s3));
|
294
|
+
int repeat = 50;
|
295
|
+
int volume = p.size();
|
296
|
+
if (just_data) {
|
297
|
+
printf("name cycles_per_byte cycles_per_byte_err gb_per_s gb_per_s_err \n");
|
298
|
+
}
|
299
|
+
BEST_TIME("simdjson ", simdjson_compute_stats(p).valid, true, , repeat,
|
300
|
+
volume, !just_data);
|
301
|
+
BEST_TIME("RapidJSON ", rapid_compute_stats(p).valid, true, , repeat, volume,
|
302
|
+
!just_data);
|
303
|
+
BEST_TIME("sasjon ", sasjon_compute_stats(p).valid, true, , repeat, volume,
|
304
|
+
!just_data);
|
305
|
+
}
|