simdjson 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.clang-format +5 -0
- data/.gitignore +14 -0
- data/.gitmodules +3 -0
- data/.rubocop.yml +9 -0
- data/.travis.yml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +39 -0
- data/Rakefile +32 -0
- data/benchmark/apache_builds.json +4421 -0
- data/benchmark/demo.json +15 -0
- data/benchmark/github_events.json +1390 -0
- data/benchmark/run_benchmark.rb +30 -0
- data/ext/simdjson/extconf.rb +22 -0
- data/ext/simdjson/simdjson.cpp +76 -0
- data/ext/simdjson/simdjson.hpp +6 -0
- data/lib/simdjson/version.rb +3 -0
- data/lib/simdjson.rb +2 -0
- data/simdjson.gemspec +35 -0
- data/vendor/.gitkeep +0 -0
- data/vendor/simdjson/AUTHORS +3 -0
- data/vendor/simdjson/CMakeLists.txt +63 -0
- data/vendor/simdjson/CONTRIBUTORS +27 -0
- data/vendor/simdjson/Dockerfile +10 -0
- data/vendor/simdjson/LICENSE +201 -0
- data/vendor/simdjson/Makefile +203 -0
- data/vendor/simdjson/Notes.md +85 -0
- data/vendor/simdjson/README.md +581 -0
- data/vendor/simdjson/amalgamation.sh +158 -0
- data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
- data/vendor/simdjson/benchmark/benchmark.h +223 -0
- data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
- data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
- data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
- data/vendor/simdjson/benchmark/parse.cpp +393 -0
- data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
- data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
- data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
- data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
- data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
- data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
- data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/gbps.png +0 -0
- data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
- data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
- data/vendor/simdjson/images/halvarflake.png +0 -0
- data/vendor/simdjson/images/logo.png +0 -0
- data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
- data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
- data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
- data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
- data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
- data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
- data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
- data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
- data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
- data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
- data/vendor/simdjson/include/simdjson/portability.h +172 -0
- data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
- data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
- data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
- data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
- data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
- data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
- data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
- data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
- data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
- data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
- data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
- data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
- data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
- data/vendor/simdjson/scripts/issue150.sh +14 -0
- data/vendor/simdjson/scripts/javascript/README.md +3 -0
- data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
- data/vendor/simdjson/scripts/minifier.sh +11 -0
- data/vendor/simdjson/scripts/parseandstat.sh +24 -0
- data/vendor/simdjson/scripts/parser.sh +11 -0
- data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
- data/vendor/simdjson/scripts/plotparse.sh +98 -0
- data/vendor/simdjson/scripts/selectparser.sh +11 -0
- data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
- data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
- data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
- data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
- data/vendor/simdjson/scripts/testjson2json.sh +99 -0
- data/vendor/simdjson/scripts/transitions/Makefile +10 -0
- data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
- data/vendor/simdjson/singleheader/README.md +1 -0
- data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
- data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
- data/vendor/simdjson/singleheader/simdjson.h +39692 -0
- data/vendor/simdjson/src/CMakeLists.txt +67 -0
- data/vendor/simdjson/src/jsonioutil.cpp +35 -0
- data/vendor/simdjson/src/jsonminifier.cpp +285 -0
- data/vendor/simdjson/src/jsonparser.cpp +91 -0
- data/vendor/simdjson/src/parsedjson.cpp +323 -0
- data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
- data/vendor/simdjson/src/simdjson.cpp +30 -0
- data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
- data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
- data/vendor/simdjson/style/clang-format-check.sh +25 -0
- data/vendor/simdjson/style/clang-format.sh +25 -0
- data/vendor/simdjson/style/run-clang-format.py +326 -0
- data/vendor/simdjson/tape.md +134 -0
- data/vendor/simdjson/tests/CMakeLists.txt +25 -0
- data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
- data/vendor/simdjson/tests/basictests.cpp +75 -0
- data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
- data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
- data/vendor/simdjson/tests/pointercheck.cpp +38 -0
- data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
- data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
- data/vendor/simdjson/tools/CMakeLists.txt +3 -0
- data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
- data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
- data/vendor/simdjson/tools/json2json.cpp +112 -0
- data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
- data/vendor/simdjson/tools/jsonstats.cpp +143 -0
- data/vendor/simdjson/tools/minify.cpp +21 -0
- data/vendor/simdjson/tools/release.py +125 -0
- data/vendor/simdjson/windows/dirent_portable.h +1043 -0
- metadata +273 -0
@@ -0,0 +1,326 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
"""A wrapper script around clang-format, suitable for linting multiple files
|
3
|
+
and to use for continuous integration.
|
4
|
+
|
5
|
+
This is an alternative API for the clang-format command line.
|
6
|
+
It runs over multiple files and directories in parallel.
|
7
|
+
A diff output is produced and a sensible exit code is returned.
|
8
|
+
|
9
|
+
"""
|
10
|
+
|
11
|
+
from __future__ import print_function, unicode_literals
|
12
|
+
|
13
|
+
import argparse
|
14
|
+
import codecs
|
15
|
+
import difflib
|
16
|
+
import fnmatch
|
17
|
+
import io
|
18
|
+
import multiprocessing
|
19
|
+
import os
|
20
|
+
import signal
|
21
|
+
import subprocess
|
22
|
+
import sys
|
23
|
+
import traceback
|
24
|
+
|
25
|
+
from functools import partial
|
26
|
+
|
27
|
+
# Comma-separated file extensions used as the default of the --extensions
# option; main() splits the value on ',' before filtering files.
DEFAULT_EXTENSIONS = 'c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx'
|
28
|
+
|
29
|
+
|
30
|
+
class ExitStatus:
    """Status codes returned by main().

    SUCCESS -- no difference was reported.
    DIFF    -- at least one file differs from clang-format's output.
    TROUBLE -- a file could not be processed.
    """
    SUCCESS, DIFF, TROUBLE = 0, 1, 2
|
34
|
+
|
35
|
+
|
36
|
+
def list_files(files, recursive=False, extensions=None, exclude=None):
    """Expand the given paths into the list of files to check.

    Paths are returned unchanged unless `recursive` is true and the path is
    a directory. Such directories are walked; a file found during the walk is
    kept when its extension (without the dot) is in `extensions` and its path
    matches none of the fnmatch patterns in `exclude`. Sub-directories whose
    path matches an exclude pattern are not descended into.
    """
    wanted_exts = [] if extensions is None else extensions
    skip_patterns = [] if exclude is None else exclude

    def is_excluded(path):
        return any(fnmatch.fnmatch(path, pat) for pat in skip_patterns)

    found = []
    for entry in files:
        if not (recursive and os.path.isdir(entry)):
            found.append(entry)
            continue
        for root, subdirs, names in os.walk(entry):
            # os.walk() honours in-place edits of the directory list,
            # which avoids listing excluded directories at all.
            subdirs[:] = [
                d for d in subdirs if not is_excluded(os.path.join(root, d))
            ]
            for name in names:
                candidate = os.path.join(root, name)
                if is_excluded(candidate):
                    continue
                if os.path.splitext(candidate)[1][1:] in wanted_exts:
                    found.append(candidate)
    return found
|
66
|
+
|
67
|
+
|
68
|
+
def make_diff(file, original, reformatted):
    """Return the unified diff (3 context lines) between two lists of lines.

    `file` only labels the two sides of the diff header.
    """
    labels = {
        'fromfile': '{}\t(original)'.format(file),
        'tofile': '{}\t(reformatted)'.format(file),
    }
    delta = difflib.unified_diff(original, reformatted, n=3, **labels)
    return list(delta)
|
76
|
+
|
77
|
+
|
78
|
+
class DiffError(Exception):
    """Raised when a file cannot be diffed.

    `errs` holds the stderr lines to report with the error (empty list when
    none were given).
    """

    def __init__(self, message, errs=None):
        super(DiffError, self).__init__(message)
        self.errs = errs if errs else []
|
82
|
+
|
83
|
+
|
84
|
+
class UnexpectedError(Exception):
    """Wraps an exception that was not anticipated while processing a file.

    The traceback that is current when the instance is created is stored as
    text in `formatted_traceback`; the wrapped exception is kept in `exc`.
    """

    def __init__(self, message, exc=None):
        super(UnexpectedError, self).__init__(message)
        self.exc = exc
        self.formatted_traceback = traceback.format_exc()
|
89
|
+
|
90
|
+
|
91
|
+
def run_clang_format_diff_wrapper(args, file):
    """Call run_clang_format_diff() and normalise its failures.

    DiffError propagates unchanged; any other exception is re-raised as an
    UnexpectedError whose message names the file and the exception class.
    """
    try:
        return run_clang_format_diff(args, file)
    except DiffError:
        raise
    except Exception as err:
        summary = '{}: {}: {}'.format(file, err.__class__.__name__, err)
        raise UnexpectedError(summary, err)
|
100
|
+
|
101
|
+
|
102
|
+
def run_clang_format_diff(args, file):
    """Run clang-format on `file` and diff its output against the file.

    args -- parsed command line; only `args.clang_format_executable` is read.
    file -- path of the file to check (read as UTF-8).

    Returns a tuple `(diff_lines, stderr_lines)`; `diff_lines` is empty when
    the file is already formatted.

    Raises DiffError when the file cannot be read, clang-format cannot be
    started, or clang-format exits with a non-zero status (in which case the
    error carries clang-format's stderr lines).
    """
    try:
        with io.open(file, 'r', encoding='utf-8') as f:
            original = f.readlines()
    except IOError as exc:
        raise DiffError(str(exc))
    invocation = [args.clang_format_executable, file]

    # Use of utf-8 to decode the process output.
    #
    # Hopefully, this is the correct thing to do.
    #
    # It's done due to the following assumptions (which may be incorrect):
    # - clang-format will return the bytes read from the files as-is,
    #   without conversion, and it is already assumed that the files use utf-8.
    # - if the diagnostics were internationalized, they would use utf-8:
    #   > Adding Translations to Clang
    #   >
    #   > Not possible yet!
    #   > Diagnostic strings should be written in UTF-8,
    #   > the client can translate to the relevant code page if needed.
    #   > Each translation completely replaces the format string
    #   > for the diagnostic.
    #   > -- http://clang.llvm.org/docs/InternalsManual.html#internals-diag-translation
    #
    # It's not pretty, due to Python 2 & 3 compatibility.
    encoding_py3 = {}
    if sys.version_info[0] >= 3:
        encoding_py3['encoding'] = 'utf-8'

    try:
        proc = subprocess.Popen(
            invocation,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            **encoding_py3)
    except OSError as exc:
        raise DiffError(str(exc))

    # communicate() drains stdout and stderr together and waits for the
    # process. Reading stdout to EOF before touching stderr could block
    # forever if clang-format filled the stderr pipe first.
    stdout_text, stderr_text = proc.communicate()
    if sys.version_info[0] < 3:
        # Python 2 hands back byte strings; decode so that the lines are
        # unicode, like the lines read from the file above.
        stdout_text = stdout_text.decode('utf-8')
        stderr_text = stderr_text.decode('utf-8')
    # StringIO.readlines() splits on '\n' only, like reading from the pipe,
    # so the lines stay comparable with `original`.
    outs = io.StringIO(stdout_text).readlines()
    errs = io.StringIO(stderr_text).readlines()

    if proc.returncode:
        raise DiffError("clang-format exited with status {}: '{}'".format(
            proc.returncode, file), errs)
    return make_diff(file, original, outs), errs
|
157
|
+
|
158
|
+
|
159
|
+
def bold_red(s):
    """Wrap `s` in the ANSI escapes for bold + red, followed by a reset."""
    return ''.join(('\x1b[1m', '\x1b[31m', s, '\x1b[0m'))
|
161
|
+
|
162
|
+
|
163
|
+
def colorize(diff_lines):
    """Yield the lines of a unified diff wrapped in ANSI colour escapes.

    File headers ('--- ' / '+++ ') become bold, hunk headers ('@@ ') cyan,
    additions green and removals red; every other line is yielded unchanged.
    """
    reset = '\x1b[0m'
    # Order matters: the file headers must be tested before the bare
    # '+' / '-' prefixes they also start with.
    rules = (
        (('--- ', '+++ '), '\x1b[1m'),  # bold
        (('@@ ',), '\x1b[36m'),         # cyan
        (('+',), '\x1b[32m'),           # green
        (('-',), '\x1b[31m'),           # red
    )
    for line in diff_lines:
        for prefixes, code in rules:
            if line.startswith(prefixes):
                yield code + line + reset
                break
        else:
            yield line
|
187
|
+
|
188
|
+
|
189
|
+
def print_diff(diff_lines, use_color):
    """Write the diff lines to stdout, colourised when `use_color` is true.

    On Python 2 the lines are encoded to UTF-8 before being written.
    """
    lines = colorize(diff_lines) if use_color else diff_lines
    if sys.version_info[0] >= 3:
        sys.stdout.writelines(lines)
    else:
        sys.stdout.writelines((l.encode('utf-8') for l in lines))
|
196
|
+
|
197
|
+
|
198
|
+
def print_trouble(prog, message, use_colors):
    """Print '<prog>: error: <message>' to stderr.

    The 'error:' tag is rendered bold red when `use_colors` is true.
    """
    tag = bold_red('error:') if use_colors else 'error:'
    print("{}: {} {}".format(prog, tag, message), file=sys.stderr)
|
203
|
+
|
204
|
+
|
205
|
+
def main():
    """Parse the command line, run clang-format over the selected files and
    print the resulting diffs.

    Returns an ExitStatus value: SUCCESS when nothing differs (or no file was
    selected), DIFF when at least one file differs, TROUBLE when a file could
    not be processed. An unexpected error stops the run immediately.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--clang-format-executable',
        metavar='EXECUTABLE',
        help='path to the clang-format executable',
        default='clang-format')
    parser.add_argument(
        '--extensions',
        help='comma separated list of file extensions (default: {})'.format(
            DEFAULT_EXTENSIONS),
        default=DEFAULT_EXTENSIONS)
    parser.add_argument(
        '-r',
        '--recursive',
        action='store_true',
        help='run recursively over directories')
    parser.add_argument('files', metavar='file', nargs='+')
    parser.add_argument(
        '-q',
        '--quiet',
        action='store_true')
    parser.add_argument(
        '-j',
        metavar='N',
        type=int,
        default=0,
        help='run N clang-format jobs in parallel'
        ' (default number of cpus + 1)')
    parser.add_argument(
        '--color',
        default='auto',
        choices=['auto', 'always', 'never'],
        help='show colored diff (default: auto)')
    parser.add_argument(
        '-e',
        '--exclude',
        metavar='PATTERN',
        action='append',
        default=[],
        help='exclude paths matching the given glob-like pattern(s)'
        ' from recursive search')

    args = parser.parse_args()

    # use default signal handling, like diff return SIGINT value on ^C
    # https://bugs.python.org/issue14229#msg156446
    signal.signal(signal.SIGINT, signal.SIG_DFL)
    # compatibility, SIGPIPE does not exist on Windows
    if hasattr(signal, 'SIGPIPE'):
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)

    colored_stdout = False
    colored_stderr = False
    if args.color == 'always':
        colored_stdout = True
        colored_stderr = True
    elif args.color == 'auto':
        colored_stdout = sys.stdout.isatty()
        colored_stderr = sys.stderr.isatty()

    retcode = ExitStatus.SUCCESS
    files = list_files(
        args.files,
        recursive=args.recursive,
        exclude=args.exclude,
        extensions=args.extensions.split(','))

    if not files:
        return ExitStatus.SUCCESS

    njobs = args.j
    if njobs == 0:
        njobs = multiprocessing.cpu_count() + 1
    njobs = min(len(files), njobs)

    if njobs == 1:
        # execute directly instead of in a pool,
        # less overhead, simpler stacktraces
        pool = None
        pending = iter(files)

        def next_result():
            # A plain function rather than a generator expression: a
            # generator is finished by the first exception it raises, which
            # would silently skip every file after the first DiffError.
            return run_clang_format_diff_wrapper(args, next(pending))
    else:
        pool = multiprocessing.Pool(njobs)
        results = pool.imap_unordered(
            partial(run_clang_format_diff_wrapper, args), files)

        def next_result():
            return next(results)

    while True:
        try:
            outs, errs = next_result()
        except StopIteration:
            break
        except DiffError as e:
            print_trouble(parser.prog, str(e), use_colors=colored_stderr)
            retcode = ExitStatus.TROUBLE
            sys.stderr.writelines(e.errs)
        except UnexpectedError as e:
            print_trouble(parser.prog, str(e), use_colors=colored_stderr)
            sys.stderr.write(e.formatted_traceback)
            retcode = ExitStatus.TROUBLE
            # stop at the first unexpected error,
            # something could be very wrong,
            # don't process all files unnecessarily
            if pool:
                pool.terminate()
            break
        else:
            sys.stderr.writelines(errs)
            if outs == []:
                continue
            if not args.quiet:
                print_diff(outs, use_color=colored_stdout)
            if retcode == ExitStatus.SUCCESS:
                retcode = ExitStatus.DIFF

    # Release the worker processes; close() is a no-op after terminate().
    if pool:
        pool.close()
        pool.join()
    return retcode
|
322
|
+
|
323
|
+
|
324
|
+
if __name__ == '__main__':
    # The status returned by main() is deliberately not passed to
    # sys.exit(): we don't want a hard failure on a style check.
    main()
|
@@ -0,0 +1,134 @@
|
|
1
|
+
|
2
|
+
# Tape structure in simdjson
|
3
|
+
|
4
|
+
We parse a JSON document to a tape. A tape is an array of 64-bit values. Each node encountered in the JSON document is written to the tape using one or more 64-bit tape elements; the layout of the tape is in "document order": elements are stored as they are encountered in the JSON document.
|
5
|
+
|
6
|
+
Throughout, little endian encoding is assumed. The tape is indexed starting at 0 (the first element is at index 0).
|
7
|
+
|
8
|
+
## Example
|
9
|
+
|
10
|
+
It is sometimes useful to start with an example. Consider the following JSON document:
|
11
|
+
|
12
|
+
```json
|
13
|
+
{
|
14
|
+
"Image": {
|
15
|
+
"Width": 800,
|
16
|
+
"Height": 600,
|
17
|
+
"Title": "View from 15th Floor",
|
18
|
+
"Thumbnail": {
|
19
|
+
"Url": "http://www.example.com/image/481989943",
|
20
|
+
"Height": 125,
|
21
|
+
"Width": 100
|
22
|
+
},
|
23
|
+
"Animated": false,
|
24
|
+
"IDs": [116, 943, 234, 38793]
|
25
|
+
}
|
26
|
+
}
|
27
|
+
```
|
28
|
+
|
29
|
+
The following is a dump of the content of the tape, with the first number of each line representing the index of a tape element.
|
30
|
+
|
31
|
+
```bash
|
32
|
+
$ ./json2json -d jsonexamples/small/demo.json
|
33
|
+
0 : r // pointing to 38 (right after last node)
|
34
|
+
1 : { // pointing to next tape location 38 (first node after the scope)
|
35
|
+
2 : string "Image"
|
36
|
+
3 : { // pointing to next tape location 37 (first node after the scope)
|
37
|
+
4 : string "Width"
|
38
|
+
5 : integer 800
|
39
|
+
7 : string "Height"
|
40
|
+
8 : integer 600
|
41
|
+
10 : string "Title"
|
42
|
+
11 : string "View from 15th Floor"
|
43
|
+
12 : string "Thumbnail"
|
44
|
+
13 : { // pointing to next tape location 23 (first node after the scope)
|
45
|
+
14 : string "Url"
|
46
|
+
15 : string "http://www.example.com/image/481989943"
|
47
|
+
16 : string "Height"
|
48
|
+
17 : integer 125
|
49
|
+
19 : string "Width"
|
50
|
+
20 : integer 100
|
51
|
+
22 : } // pointing to previous tape location 13 (start of the scope)
|
52
|
+
23 : string "Animated"
|
53
|
+
24 : false
|
54
|
+
25 : string "IDs"
|
55
|
+
26 : [ // pointing to next tape location 36 (first node after the scope)
|
56
|
+
27 : integer 116
|
57
|
+
29 : integer 943
|
58
|
+
31 : integer 234
|
59
|
+
33 : integer 38793
|
60
|
+
35 : ] // pointing to previous tape location 26 (start of the scope)
|
61
|
+
36 : } // pointing to previous tape location 3 (start of the scope)
|
62
|
+
37 : } // pointing to previous tape location 1 (start of the scope)
|
63
|
+
38 : r // pointing to 0 (start root)
|
64
|
+
|
65
|
+
```
|
66
|
+
|
67
|
+
## General format of the tape elements
|
68
|
+
|
69
|
+
Most tape elements are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'd', '"', '{', '}', '[', ']', 'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large.
|
70
|
+
|
71
|
+
|
72
|
+
Performance consideration: We believe that accessing the tape in regular units of 64 bits is more important for performance than saving memory.
|
73
|
+
|
74
|
+
## Simple JSON values
|
75
|
+
|
76
|
+
Simple JSON nodes are represented with one tape element:
|
77
|
+
|
78
|
+
- null is represented as the 64-bit value `('n' << 56)` where `'n'` is the 8-bit code point value (in ASCII) corresponding to the letter `'n'`.
|
79
|
+
- true is represented as the 64-bit value `('t' << 56)`.
|
80
|
+
- false is represented as the 64-bit value `('f' << 56)`.
|
81
|
+
|
82
|
+
|
83
|
+
## Integer and Double values
|
84
|
+
|
85
|
+
Integer values are represented as two 64-bit tape elements:
|
86
|
+
- The 64-bit value `('l' << 56)` followed by the 64-bit integer value literally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
|
87
|
+
|
88
|
+
Float values are represented as two 64-bit tape elements:
|
89
|
+
- The 64-bit value `('d' << 56)` followed by the 64-bit double value literally in standard IEEE 754 notation.
|
90
|
+
|
91
|
+
Performance consideration: We store numbers on the main tape because we believe that locality of reference is helpful for performance.
|
92
|
+
|
93
|
+
## Root node
|
94
|
+
|
95
|
+
Each JSON document will have two special 64-bit tape elements representing a root node, one at the beginning and one at the end.
|
96
|
+
|
97
|
+
- The first 64-bit tape element contains the value `('r' << 56) + x` where `x` is the location on the tape of the last root element.
|
98
|
+
- The last 64-bit tape element contains the value `('r' << 56)`.
|
99
|
+
|
100
|
+
All of the parsed document is located between these two 64-bit tape elements.
|
101
|
+
|
102
|
+
Hint: We can read the first tape element to determine the length of the tape.
|
103
|
+
|
104
|
+
|
105
|
+
## Strings
|
106
|
+
|
107
|
+
We prefix the string data itself by a 32-bit header to be interpreted as a 32-bit integer. It indicates the length of the string. The actual string data starts at an offset of 4 bytes.
|
108
|
+
|
109
|
+
We store string values using UTF-8 encoding with null termination on a separate tape. A string value is represented on the main tape as the 64-bit tape element `('"' << 56) + x` where the payload `x` is the location on the string tape of the null-terminated string.
|
110
|
+
|
111
|
+
## Arrays
|
112
|
+
|
113
|
+
JSON arrays are represented using two 64-bit tape elements.
|
114
|
+
|
115
|
+
- The first 64-bit tape element contains the value `('[' << 56) + x` where the payload `x` is 1 + the index of the second 64-bit tape element on the tape.
|
116
|
+
- The second 64-bit tape element contains the value `(']' << 56) + x` where the payload `x` contains the index of the first 64-bit tape element on the tape.
|
117
|
+
|
118
|
+
All the content of the array is located between these two tape elements, including arrays and objects.
|
119
|
+
|
120
|
+
Performance consideration: We can skip the content of an array entirely by accessing the first 64-bit tape element, reading the payload and moving to the corresponding index on the tape.
|
121
|
+
|
122
|
+
## Objects
|
123
|
+
|
124
|
+
JSON objects are represented using two 64-bit tape elements.
|
125
|
+
|
126
|
+
- The first 64-bit tape element contains the value `('{' << 56) + x` where the payload `x` is 1 + the index of the second 64-bit tape element on the tape.
|
127
|
+
- The second 64-bit tape element contains the value `('}' << 56) + x` where the payload `x` contains the index of the first 64-bit tape element on the tape.
|
128
|
+
|
129
|
+
In-between these two tape elements, we alternate between keys (which must be strings) and values. A value could be an object or an array.
|
130
|
+
|
131
|
+
All the content of the object is located between these two tape elements, including arrays and objects.
|
132
|
+
|
133
|
+
Performance consideration: We can skip the content of an object entirely by accessing the first 64-bit tape element, reading the payload and moving to the corresponding index on the tape.
|
134
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# With MSVC, add the project's windows/ directory to the interface include
# path of the library target (build tree only).
if(MSVC)
  target_include_directories(
    ${SIMDJSON_LIB_NAME}
    INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/windows>)
endif()

add_cpp_test(basictests)
add_cpp_test(jsoncheck)
add_cpp_test(pointercheck)

## This causes problems
# add_executable(singleheader ./singleheadertest.cpp ${PROJECT_SOURCE_DIR}/singleheader/simdjson.cpp)
# target_compile_definitions(singleheader PRIVATE JSON_TEST_PATH="${PROJECT_SOURCE_DIR}/jsonexamples/twitter.json")
# target_link_libraries(singleheader ${SIMDJSON_LIB_NAME})
# add_test(singleheader singleheader)

# With MSVC, once basictests is built, echo both paths and then copy the
# simdjson target file into the directory that holds the basictests binary.
if(MSVC)
  add_custom_command(
    TARGET basictests POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E echo "$<TARGET_FILE:simdjson>"
    COMMAND ${CMAKE_COMMAND} -E echo "$<TARGET_FILE_DIR:basictests>"
    COMMAND ${CMAKE_COMMAND} -E copy_if_different
            "$<TARGET_FILE:simdjson>"         # file to copy
            "$<TARGET_FILE_DIR:basictests>")  # destination directory
endif()
|
25
|
+
|
@@ -0,0 +1,192 @@
|
|
1
|
+
#include <unistd.h>
|
2
|
+
|
3
|
+
#include "simdjson/jsonparser.h"
|
4
|
+
|
5
|
+
// #define RAPIDJSON_SSE2 // bad
|
6
|
+
// #define RAPIDJSON_SSE42 // bad
|
7
|
+
#include "fastjson.cpp"
|
8
|
+
#include "fastjson_dom.cpp"
|
9
|
+
#include "gason.cpp"
|
10
|
+
#include "json11.cpp"
|
11
|
+
#include "rapidjson/document.h"
|
12
|
+
#include "rapidjson/reader.h" // you have to check in the submodule
|
13
|
+
#include "rapidjson/stringbuffer.h"
|
14
|
+
#include "rapidjson/writer.h"
|
15
|
+
#include "sajson.h"
|
16
|
+
extern "C" {
|
17
|
+
#include "cJSON.c"
|
18
|
+
#include "cJSON.h"
|
19
|
+
#include "jsmn.c"
|
20
|
+
#include "jsmn.h"
|
21
|
+
#include "ujdecode.h"
|
22
|
+
#include "ultrajsondec.c"
|
23
|
+
}
|
24
|
+
#include "jsoncpp.cpp"
|
25
|
+
#include "json/json.h"
|
26
|
+
|
27
|
+
// fastjson has a tricky interface
|
28
|
+
void on_json_error(void *, const fastjson::ErrorContext &ec) {
|
29
|
+
// std::cerr<<"ERROR: "<<ec.mesg<<std::endl;
|
30
|
+
}
|
31
|
+
bool fastjson_parse(const char *input) {
|
32
|
+
fastjson::Token token;
|
33
|
+
fastjson::dom::Chunk chunk;
|
34
|
+
return fastjson::dom::parse_string(input, &token, &chunk, 0, &on_json_error,
|
35
|
+
NULL);
|
36
|
+
}
|
37
|
+
// end of fastjson stuff
|
38
|
+
|
39
|
+
// NOTE(review): main() uses kParseValidateEncodingFlag unqualified, which
// presumably relies on this directive — confirm before removing it.
using namespace rapidjson;
|
40
|
+
|
41
|
+
int main(int argc, char *argv[]) {
|
42
|
+
bool verbose = false;
|
43
|
+
bool just_favorites = false;
|
44
|
+
int c;
|
45
|
+
while ((c = getopt(argc, argv, "vm")) != -1)
|
46
|
+
switch (c) {
|
47
|
+
case 'v':
|
48
|
+
verbose = true;
|
49
|
+
break;
|
50
|
+
case 'm':
|
51
|
+
just_favorites = true;
|
52
|
+
break;
|
53
|
+
default:
|
54
|
+
abort();
|
55
|
+
}
|
56
|
+
if (optind >= argc) {
|
57
|
+
std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
|
58
|
+
std::cerr << "Or " << argv[0] << " -v <jsonfile>" << std::endl;
|
59
|
+
exit(1);
|
60
|
+
}
|
61
|
+
const char *filename = argv[optind];
|
62
|
+
simdjson::padded_string p;
|
63
|
+
try {
|
64
|
+
simdjson::get_corpus(filename).swap(p);
|
65
|
+
} catch (const std::exception &e) { // caught by reference to base
|
66
|
+
std::cout << "Could not load the file " << filename << std::endl;
|
67
|
+
return EXIT_FAILURE;
|
68
|
+
}
|
69
|
+
if (verbose) {
|
70
|
+
std::cout << "Input has ";
|
71
|
+
if (p.size() > 1024 * 1024)
|
72
|
+
std::cout << p.size() / (1024 * 1024) << " MB ";
|
73
|
+
else if (p.size() > 1024)
|
74
|
+
std::cout << p.size() / 1024 << " KB ";
|
75
|
+
else
|
76
|
+
std::cout << p.size() << " B ";
|
77
|
+
std::cout << std::endl;
|
78
|
+
}
|
79
|
+
simdjson::ParsedJson pj;
|
80
|
+
size_t max_depth = 1024 * 4;
|
81
|
+
bool allocok = pj.allocate_capacity(p.size(), max_depth);
|
82
|
+
if (!allocok) {
|
83
|
+
std::cerr << "can't allocate memory" << std::endl;
|
84
|
+
return EXIT_FAILURE;
|
85
|
+
}
|
86
|
+
int oursreturn = json_parse(p, pj);
|
87
|
+
bool ours_correct = (oursreturn == 0); // returns 0 on success
|
88
|
+
|
89
|
+
rapidjson::Document d;
|
90
|
+
|
91
|
+
char *buffer = (char *)malloc(p.size() + 1);
|
92
|
+
memcpy(buffer, p.data(), p.size());
|
93
|
+
buffer[p.size()] = '\0';
|
94
|
+
bool rapid_correct_checkencoding =
|
95
|
+
(d.Parse<kParseValidateEncodingFlag>((const char *)buffer)
|
96
|
+
.HasParseError() == false);
|
97
|
+
bool sajson_correct =
|
98
|
+
sajson::parse(sajson::dynamic_allocation(),
|
99
|
+
sajson::mutable_string_view(p.size(), buffer))
|
100
|
+
.is_valid();
|
101
|
+
if (just_favorites) {
|
102
|
+
printf("our parser : %s \n",
|
103
|
+
ours_correct ? "correct" : "invalid");
|
104
|
+
printf("rapid (check encoding) : %s \n",
|
105
|
+
rapid_correct_checkencoding ? "correct" : "invalid");
|
106
|
+
printf("sajson : %s \n",
|
107
|
+
sajson_correct ? "correct" : "invalid");
|
108
|
+
if (oursreturn == simdjson::DEPTH_ERROR) {
|
109
|
+
printf("simdjson encountered a DEPTH_ERROR, it was parametrized to "
|
110
|
+
"reject documents with depth exceeding %zu.\n",
|
111
|
+
max_depth);
|
112
|
+
}
|
113
|
+
if ((ours_correct != rapid_correct_checkencoding) ||
|
114
|
+
(rapid_correct_checkencoding != sajson_correct) ||
|
115
|
+
(ours_correct != sajson_correct)) {
|
116
|
+
printf("WARNING: THEY DISAGREE\n\n");
|
117
|
+
return EXIT_FAILURE;
|
118
|
+
}
|
119
|
+
free(buffer);
|
120
|
+
return EXIT_SUCCESS;
|
121
|
+
}
|
122
|
+
bool rapid_correct = (d.Parse((const char *)buffer).HasParseError() == false);
|
123
|
+
|
124
|
+
std::string json11err;
|
125
|
+
bool dropbox_correct = ((json11::Json::parse(buffer, json11err).is_null()) ||
|
126
|
+
(!json11err.empty())) == false;
|
127
|
+
bool fastjson_correct = fastjson_parse(buffer);
|
128
|
+
JsonValue value;
|
129
|
+
JsonAllocator allocator;
|
130
|
+
char *endptr;
|
131
|
+
bool gason_correct =
|
132
|
+
(jsonParse(buffer, &endptr, &value, allocator) == JSON_OK);
|
133
|
+
void *state;
|
134
|
+
bool ultrajson_correct =
|
135
|
+
((UJDecode(buffer, p.size(), NULL, &state) == NULL) == false);
|
136
|
+
|
137
|
+
auto tokens = std::make_unique<jsmntok_t[]>(p.size());
|
138
|
+
bool jsmn_correct = false;
|
139
|
+
if (tokens == nullptr) {
|
140
|
+
printf("Failed to alloc memory for jsmn\n");
|
141
|
+
} else {
|
142
|
+
jsmn_parser parser;
|
143
|
+
jsmn_init(&parser);
|
144
|
+
memcpy(buffer, p.data(), p.size());
|
145
|
+
buffer[p.size()] = '\0';
|
146
|
+
int r = jsmn_parse(&parser, buffer, p.size(), tokens.get(), p.size());
|
147
|
+
tokens = nullptr;
|
148
|
+
jsmn_correct = (r > 0);
|
149
|
+
}
|
150
|
+
|
151
|
+
memcpy(buffer, p.data(), p.size());
|
152
|
+
buffer[p.size()] = '\0';
|
153
|
+
cJSON *tree = cJSON_Parse(buffer);
|
154
|
+
bool cjson_correct = (tree != NULL);
|
155
|
+
if (tree != NULL) {
|
156
|
+
cJSON_Delete(tree);
|
157
|
+
}
|
158
|
+
|
159
|
+
Json::CharReaderBuilder b;
|
160
|
+
Json::CharReader *json_cpp_reader = b.newCharReader();
|
161
|
+
Json::Value root;
|
162
|
+
Json::String errs;
|
163
|
+
bool is_json_cpp_ok =
|
164
|
+
json_cpp_reader->parse(buffer, buffer + p.size(), &root, &errs);
|
165
|
+
delete json_cpp_reader;
|
166
|
+
|
167
|
+
printf("our parser : %s \n",
|
168
|
+
ours_correct ? "correct" : "invalid");
|
169
|
+
printf("rapid : %s \n",
|
170
|
+
rapid_correct ? "correct" : "invalid");
|
171
|
+
printf("rapid (check encoding) : %s \n",
|
172
|
+
rapid_correct_checkencoding ? "correct" : "invalid");
|
173
|
+
printf("sajson : %s \n",
|
174
|
+
sajson_correct ? "correct" : "invalid");
|
175
|
+
printf("dropbox : %s \n",
|
176
|
+
dropbox_correct ? "correct" : "invalid");
|
177
|
+
printf("fastjson : %s \n",
|
178
|
+
fastjson_correct ? "correct" : "invalid");
|
179
|
+
printf("gason : %s \n",
|
180
|
+
gason_correct ? "correct" : "invalid");
|
181
|
+
printf("ultrajson : %s \n",
|
182
|
+
ultrajson_correct ? "correct" : "invalid");
|
183
|
+
printf("jsmn : %s \n",
|
184
|
+
jsmn_correct ? "correct" : "invalid");
|
185
|
+
printf("cjson : %s \n",
|
186
|
+
cjson_correct ? "correct" : "invalid");
|
187
|
+
printf("jsoncpp : %s \n",
|
188
|
+
is_json_cpp_ok ? "correct" : "invalid");
|
189
|
+
|
190
|
+
free(buffer);
|
191
|
+
return EXIT_SUCCESS;
|
192
|
+
}
|