simdjson 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (132) hide show
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
@@ -0,0 +1,587 @@
1
+ #ifndef SIMDJSON_NUMBERPARSING_H
2
+ #define SIMDJSON_NUMBERPARSING_H
3
+
4
+ #include "simdjson/common_defs.h"
5
+ #include "simdjson/jsoncharutils.h"
6
+ #include "simdjson/parsedjson.h"
7
+ #include "simdjson/portability.h"
8
+
9
+ #ifdef JSON_TEST_NUMBERS // for unit testing
10
+ void found_invalid_number(const uint8_t *buf);
11
+ void found_integer(int64_t result, const uint8_t *buf);
12
+ void found_float(double result, const uint8_t *buf);
13
+ #endif
14
+
15
+ namespace simdjson {
16
+ // Allowable floating-point values range from
17
+ // std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
18
+ // so from -1.7976e308 all the way to 1.7976e308 in binary64. The lowest
19
+ // positive normal value is std::numeric_limits<double>::min() or
20
+ // about 2.225074e-308.
21
+ static const double power_of_ten[] = {
22
+ 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
23
+ 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
24
+ 1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282,
25
+ 1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273,
26
+ 1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264,
27
+ 1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255,
28
+ 1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246,
29
+ 1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237,
30
+ 1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228,
31
+ 1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219,
32
+ 1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210,
33
+ 1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201,
34
+ 1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192,
35
+ 1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183,
36
+ 1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174,
37
+ 1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165,
38
+ 1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156,
39
+ 1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147,
40
+ 1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138,
41
+ 1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129,
42
+ 1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120,
43
+ 1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111,
44
+ 1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102,
45
+ 1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93,
46
+ 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84,
47
+ 1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, 1e-75,
48
+ 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66,
49
+ 1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57,
50
+ 1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48,
51
+ 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39,
52
+ 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30,
53
+ 1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21,
54
+ 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12,
55
+ 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3,
56
+ 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6,
57
+ 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
58
+ 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24,
59
+ 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33,
60
+ 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42,
61
+ 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51,
62
+ 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60,
63
+ 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
64
+ 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78,
65
+ 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87,
66
+ 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96,
67
+ 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105,
68
+ 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114,
69
+ 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123,
70
+ 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132,
71
+ 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141,
72
+ 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150,
73
+ 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
74
+ 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168,
75
+ 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177,
76
+ 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186,
77
+ 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195,
78
+ 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204,
79
+ 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213,
80
+ 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222,
81
+ 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231,
82
+ 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240,
83
+ 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
84
+ 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258,
85
+ 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267,
86
+ 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276,
87
+ 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285,
88
+ 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294,
89
+ 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303,
90
+ 1e304, 1e305, 1e306, 1e307, 1e308};
91
+
92
// Returns true when `c` is one of the ASCII decimal digits '0' through '9'.
static inline bool is_integer(char c) {
  // Written as two explicit comparisons; optimizing compilers typically fold
  // this into a single unsigned range check.
  if (c < '0') {
    return false;
  }
  return c <= '9';
}
96
+
97
+ // We need to check that the character following a zero is valid. This is
98
+ // probably frequent and it is harder than it looks. We are building all of this
99
+ // just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
100
+ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
101
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
102
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
103
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
104
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
105
+ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106
+ 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
112
+
113
+ really_inline bool
114
+ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
115
+ return structural_or_whitespace_or_exponent_or_decimal_negated[c];
116
+ }
117
+ } // namespace simdjson
118
+ #ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING
119
+ #define SWAR_NUMBER_PARSING
120
+ #endif
121
+
122
+ #ifdef SWAR_NUMBER_PARSING
123
+
124
+ namespace simdjson {
125
+ // check quickly whether the next 8 chars are made of digits
126
+ // at a glance, it looks better than Mula's
127
+ // http://0x80.pl/articles/swar-digits-validate.html
128
+ static inline bool is_made_of_eight_digits_fast(const char *chars) {
129
+ uint64_t val;
130
+ // this can read up to 7 bytes beyond the buffer size, but we require
131
+ // SIMDJSON_PADDING of padding
132
+ static_assert(7 <= SIMDJSON_PADDING);
133
+ memcpy(&val, chars, 8);
134
+ // a branchy method might be faster:
135
+ // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
136
+ // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
137
+ // 0x3030303030303030);
138
+ return (((val & 0xF0F0F0F0F0F0F0F0) |
139
+ (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
140
+ 0x3333333333333333);
141
+ }
142
+ } // namespace simdjson
143
+ #ifdef IS_X86_64
144
+ TARGET_WESTMERE
145
+ namespace simdjson {
146
+ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
147
+ // this actually computes *16* values so we are being wasteful.
148
+ const __m128i ascii0 = _mm_set1_epi8('0');
149
+ const __m128i mul_1_10 =
150
+ _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
151
+ const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
152
+ const __m128i mul_1_10000 =
153
+ _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
154
+ const __m128i input = _mm_sub_epi8(
155
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
156
+ const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
157
+ const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
158
+ const __m128i t3 = _mm_packus_epi32(t2, t2);
159
+ const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
160
+ return _mm_cvtsi128_si32(
161
+ t4); // only captures the sum of the first 8 digits, drop the rest
162
+ }
163
+ } // namespace simdjson
164
+ UNTARGET_REGION
165
+ #endif
166
+
167
+ namespace simdjson {
168
+ #ifdef IS_ARM64
169
+ // we don't have SSE, so let us use a scalar function
170
+ // credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
171
// Scalar variant: converts the eight ASCII digits at `chars` into their
// numeric value using three multiply-and-shift steps on one 64-bit word.
// The caller is expected to have verified that the bytes are digits.
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
  uint64_t word;
  memcpy(&word, chars, sizeof(word));

  // Keep the low nibble of each byte (the digit value), then merge adjacent
  // bytes into two-digit values.
  const uint64_t nibbles = word & 0x0F0F0F0F0F0F0F0F;
  const uint64_t pairs = (nibbles * 2561) >> 8;
  // Merge adjacent two-digit values into four-digit values.
  const uint64_t quads = ((pairs & 0x00FF00FF00FF00FF) * 6553601) >> 16;
  // Merge the two four-digit values into the final eight-digit value.
  const uint64_t total =
      ((quads & 0x0000FFFF0000FFFF) * 42949672960001) >> 32;
  return static_cast<uint32_t>(total);
}
178
+ #endif
179
+
180
+ #endif
181
+
182
+ //
183
+ // This function computes base * 10 ^ negative_exponent.
184
+ // It is only ever going to be used when negative_exponent is tiny.
185
// Scales `base` by ten raised to `negative_exponent`, in two steps: first by
// 1e-308, then by 10^(negative_exponent + 308). Relies on pow(), so this is
// a slow path.
static double subnormal_power10(double base, int negative_exponent) {
  const double scaled_base = base * 1e-308;
  const int remaining_exponent = negative_exponent + 308;
  return scaled_base * pow(10, remaining_exponent);
}
189
+
190
+ // called by parse_number when we know that the output is a float,
191
+ // but where there might be some integer overflow. The trick here is to
192
+ // parse using floats from the start.
193
+ // Do not call this function directly as it skips some of the checks from
194
+ // parse_number
195
+ //
196
+ // This function will almost never be called!!!
197
+ //
198
+ // Note: a redesign could avoid this function entirely.
199
+ //
200
+ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
201
+ const uint32_t offset, bool found_minus) {
202
+ const char *p = reinterpret_cast<const char *>(buf + offset);
203
+ bool negative = false;
204
+ if (found_minus) {
205
+ ++p;
206
+ negative = true;
207
+ }
208
+ long double i;
209
+ if (*p == '0') { // 0 cannot be followed by an integer
210
+ ++p;
211
+ i = 0;
212
+ } else {
213
+ unsigned char digit = *p - '0';
214
+ i = digit;
215
+ p++;
216
+ while (is_integer(*p)) {
217
+ digit = *p - '0';
218
+ i = 10 * i + digit;
219
+ ++p;
220
+ }
221
+ }
222
+ if ('.' == *p) {
223
+ ++p;
224
+ int fractional_weight = 308;
225
+ if (is_integer(*p)) {
226
+ unsigned char digit = *p - '0';
227
+ ++p;
228
+
229
+ fractional_weight--;
230
+ i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
231
+ : 0);
232
+ } else {
233
+ #ifdef JSON_TEST_NUMBERS // for unit testing
234
+ found_invalid_number(buf + offset);
235
+ #endif
236
+ return false;
237
+ }
238
+ while (is_integer(*p)) {
239
+ unsigned char digit = *p - '0';
240
+ ++p;
241
+ fractional_weight--;
242
+ i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
243
+ : 0);
244
+ }
245
+ }
246
+ if (('e' == *p) || ('E' == *p)) {
247
+ ++p;
248
+ bool neg_exp = false;
249
+ if ('-' == *p) {
250
+ neg_exp = true;
251
+ ++p;
252
+ } else if ('+' == *p) {
253
+ ++p;
254
+ }
255
+ if (!is_integer(*p)) {
256
+ #ifdef JSON_TEST_NUMBERS // for unit testing
257
+ found_invalid_number(buf + offset);
258
+ #endif
259
+ return false;
260
+ }
261
+ unsigned char digit = *p - '0';
262
+ int64_t exp_number = digit; // exponential part
263
+ p++;
264
+ if (is_integer(*p)) {
265
+ digit = *p - '0';
266
+ exp_number = 10 * exp_number + digit;
267
+ ++p;
268
+ }
269
+ if (is_integer(*p)) {
270
+ digit = *p - '0';
271
+ exp_number = 10 * exp_number + digit;
272
+ ++p;
273
+ }
274
+ if (is_integer(*p)) {
275
+ digit = *p - '0';
276
+ exp_number = 10 * exp_number + digit;
277
+ ++p;
278
+ }
279
+ while (is_integer(*p)) {
280
+ if (exp_number > 0x100000000) { // we need to check for overflows
281
+ // we refuse to parse this
282
+ #ifdef JSON_TEST_NUMBERS // for unit testing
283
+ found_invalid_number(buf + offset);
284
+ #endif
285
+ return false;
286
+ }
287
+ digit = *p - '0';
288
+ exp_number = 10 * exp_number + digit;
289
+ ++p;
290
+ }
291
+ if (unlikely(exp_number > 308)) {
292
+ // this path is unlikely
293
+ if (neg_exp) {
294
+ // We either have zero or a subnormal.
295
+ // We expect this to be uncommon so we go through a slow path.
296
+ i = subnormal_power10(i, -exp_number);
297
+ } else {
298
+ // We know for sure that we have a number that is too large,
299
+ // we refuse to parse this
300
+ #ifdef JSON_TEST_NUMBERS // for unit testing
301
+ found_invalid_number(buf + offset);
302
+ #endif
303
+ return false;
304
+ }
305
+ } else {
306
+ int exponent = (neg_exp ? -exp_number : exp_number);
307
+ // we have that exp_number is [0,308] so that
308
+ // exponent is [-308,308] so that
309
+ // 308 + exponent is in [0, 2 * 308]
310
+ i *= power_of_ten[308 + exponent];
311
+ }
312
+ }
313
+ if (is_not_structural_or_whitespace(*p)) {
314
+ return false;
315
+ }
316
+ double d = negative ? -i : i;
317
+ pj.write_tape_double(d);
318
+ #ifdef JSON_TEST_NUMBERS // for unit testing
319
+ found_float(d, buf + offset);
320
+ #endif
321
+ return is_structural_or_whitespace(*p);
322
+ }
323
+
324
+ // called by parse_number when we know that the output is an integer,
325
+ // but where there might be some integer overflow.
326
+ // we want to catch overflows!
327
+ // Do not call this function directly as it skips some of the checks from
328
+ // parse_number
329
+ //
330
+ // This function will almost never be called!!!
331
+ //
332
+ static never_inline bool parse_large_integer(const uint8_t *const buf,
333
+ ParsedJson &pj,
334
+ const uint32_t offset,
335
+ bool found_minus) {
336
+ const char *p = reinterpret_cast<const char *>(buf + offset);
337
+
338
+ bool negative = false;
339
+ if (found_minus) {
340
+ ++p;
341
+ negative = true;
342
+ }
343
+ uint64_t i;
344
+ if (*p == '0') { // 0 cannot be followed by an integer
345
+ ++p;
346
+ i = 0;
347
+ } else {
348
+ unsigned char digit = *p - '0';
349
+ i = digit;
350
+ p++;
351
+ // the is_made_of_eight_digits_fast routine is unlikely to help here because
352
+ // we rarely see large integer parts like 123456789
353
+ while (is_integer(*p)) {
354
+ digit = *p - '0';
355
+ if (mul_overflow(i, 10, &i)) {
356
+ #ifdef JSON_TEST_NUMBERS // for unit testing
357
+ found_invalid_number(buf + offset);
358
+ #endif
359
+ return false; // overflow
360
+ }
361
+ if (add_overflow(i, digit, &i)) {
362
+ #ifdef JSON_TEST_NUMBERS // for unit testing
363
+ found_invalid_number(buf + offset);
364
+ #endif
365
+ return false; // overflow
366
+ }
367
+ ++p;
368
+ }
369
+ }
370
+ if (negative) {
371
+ if (i > 0x8000000000000000) {
372
+ // overflows!
373
+ #ifdef JSON_TEST_NUMBERS // for unit testing
374
+ found_invalid_number(buf + offset);
375
+ #endif
376
+ return false; // overflow
377
+ }
378
+ } else {
379
+ if (i >= 0x8000000000000000) {
380
+ // overflows!
381
+ #ifdef JSON_TEST_NUMBERS // for unit testing
382
+ found_invalid_number(buf + offset);
383
+ #endif
384
+ return false; // overflow
385
+ }
386
+ }
387
+ int64_t signed_answer =
388
+ negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
389
+ pj.write_tape_s64(signed_answer);
390
+ #ifdef JSON_TEST_NUMBERS // for unit testing
391
+ found_integer(signed_answer, buf + offset);
392
+ #endif
393
+ return is_structural_or_whitespace(*p);
394
+ }
395
+
396
+ // parse the number at buf + offset
397
+ // define JSON_TEST_NUMBERS for unit testing
398
+ //
399
+ // It is assumed that the number is followed by a structural ({,},],[) character
400
+ // or a white space character. If that is not the case (e.g., when the JSON
401
+ // document is made of a single number), then it is necessary to copy the
402
+ // content and append a space before calling this function.
403
+ //
404
+ // Our objective is accurate parsing (ULP of 0 or 1) at high speed.
405
+ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
406
+ const uint32_t offset,
407
+ bool found_minus) {
408
+ #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
409
+ // useful to skip parsing
410
+ pj.write_tape_s64(0); // always write zero
411
+ return true; // always succeeds
412
+ #else
413
+ const char *p = reinterpret_cast<const char *>(buf + offset);
414
+ bool negative = false;
415
+ if (found_minus) {
416
+ ++p;
417
+ negative = true;
418
+ if (!is_integer(*p)) { // a negative sign must be followed by an integer
419
+ #ifdef JSON_TEST_NUMBERS // for unit testing
420
+ found_invalid_number(buf + offset);
421
+ #endif
422
+ return false;
423
+ }
424
+ }
425
+ const char *const start_digits = p;
426
+
427
+ uint64_t i; // an unsigned int avoids signed overflows (which are bad)
428
+ if (*p == '0') { // 0 cannot be followed by an integer
429
+ ++p;
430
+ if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
431
+ #ifdef JSON_TEST_NUMBERS // for unit testing
432
+ found_invalid_number(buf + offset);
433
+ #endif
434
+ return false;
435
+ }
436
+ i = 0;
437
+ } else {
438
+ if (!(is_integer(*p))) { // must start with an integer
439
+ #ifdef JSON_TEST_NUMBERS // for unit testing
440
+ found_invalid_number(buf + offset);
441
+ #endif
442
+ return false;
443
+ }
444
+ unsigned char digit = *p - '0';
445
+ i = digit;
446
+ p++;
447
+ // the is_made_of_eight_digits_fast routine is unlikely to help here because
448
+ // we rarely see large integer parts like 123456789
449
+ while (is_integer(*p)) {
450
+ digit = *p - '0';
451
+ // a multiplication by 10 is cheaper than an arbitrary integer
452
+ // multiplication
453
+ i = 10 * i + digit; // might overflow, we will handle the overflow later
454
+ ++p;
455
+ }
456
+ }
457
+ int64_t exponent = 0;
458
+ bool is_float = false;
459
+ if ('.' == *p) {
460
+ is_float = true; // At this point we know that we have a float
461
+ // we continue with the fiction that we have an integer. If the
462
+ // floating point number is representable as x * 10^z for some integer
463
+ // z that fits in 53 bits, then we will be able to convert back the
464
+ // integer into a float in a lossless manner.
465
+ ++p;
466
+ const char *const first_after_period = p;
467
+ if (is_integer(*p)) {
468
+ unsigned char digit = *p - '0';
469
+ ++p;
470
+ i = i * 10 + digit; // might overflow + multiplication by 10 is likely
471
+ // cheaper than arbitrary mult.
472
+ // we will handle the overflow later
473
+ } else {
474
+ #ifdef JSON_TEST_NUMBERS // for unit testing
475
+ found_invalid_number(buf + offset);
476
+ #endif
477
+ return false;
478
+ }
479
+ #ifdef SWAR_NUMBER_PARSING
480
+ // this helps if we have lots of decimals!
481
+ // this turns out to be frequent enough.
482
+ if (is_made_of_eight_digits_fast(p)) {
483
+ i = i * 100000000 + parse_eight_digits_unrolled(p);
484
+ p += 8;
485
+ }
486
+ #endif
487
+ while (is_integer(*p)) {
488
+ unsigned char digit = *p - '0';
489
+ ++p;
490
+ i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
491
+ // because we fall back to parse_float later.
492
+ }
493
+ exponent = first_after_period - p;
494
+ }
495
+ int digit_count =
496
+ p - start_digits - 1; // used later to guard against overflows
497
+ int64_t exp_number = 0; // exponential part
498
+ if (('e' == *p) || ('E' == *p)) {
499
+ is_float = true;
500
+ ++p;
501
+ bool neg_exp = false;
502
+ if ('-' == *p) {
503
+ neg_exp = true;
504
+ ++p;
505
+ } else if ('+' == *p) {
506
+ ++p;
507
+ }
508
+ if (!is_integer(*p)) {
509
+ #ifdef JSON_TEST_NUMBERS // for unit testing
510
+ found_invalid_number(buf + offset);
511
+ #endif
512
+ return false;
513
+ }
514
+ unsigned char digit = *p - '0';
515
+ exp_number = digit;
516
+ p++;
517
+ if (is_integer(*p)) {
518
+ digit = *p - '0';
519
+ exp_number = 10 * exp_number + digit;
520
+ ++p;
521
+ }
522
+ if (is_integer(*p)) {
523
+ digit = *p - '0';
524
+ exp_number = 10 * exp_number + digit;
525
+ ++p;
526
+ }
527
+ while (is_integer(*p)) {
528
+ if (exp_number > 0x100000000) { // we need to check for overflows
529
+ // we refuse to parse this
530
+ #ifdef JSON_TEST_NUMBERS // for unit testing
531
+ found_invalid_number(buf + offset);
532
+ #endif
533
+ return false;
534
+ }
535
+ digit = *p - '0';
536
+ exp_number = 10 * exp_number + digit;
537
+ ++p;
538
+ }
539
+ exponent += (neg_exp ? -exp_number : exp_number);
540
+ }
541
+ if (is_float) {
542
+ uint64_t power_index = 308 + exponent;
543
+ if (unlikely((digit_count >= 19))) { // this is uncommon
544
+ // It is possible that the integer had an overflow.
545
+ // We have to handle the case where we have 0.0000somenumber.
546
+ const char *start = start_digits;
547
+ while ((*start == '0') || (*start == '.')) {
548
+ start++;
549
+ }
550
+ // we over-decrement by one when there is a '.'
551
+ digit_count -= (start - start_digits);
552
+ if (digit_count >= 19) {
553
+ // Ok, chances are good that we had an overflow!
554
+ // this is almost never going to get called!!!
555
+ // we start anew, going slowly!!!
556
+ return parse_float(buf, pj, offset, found_minus);
557
+ }
558
+ }
559
+ if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
560
+ // this is almost never going to get called!!!
561
+ // we start anew, going slowly!!!
562
+ return parse_float(buf, pj, offset, found_minus);
563
+ }
564
+ double factor = power_of_ten[power_index];
565
+ factor = negative ? -factor : factor;
566
+ double d = i * factor;
567
+ pj.write_tape_double(d);
568
+ #ifdef JSON_TEST_NUMBERS // for unit testing
569
+ found_float(d, buf + offset);
570
+ #endif
571
+ } else {
572
+ if (unlikely(digit_count >= 18)) { // this is uncommon!!!
573
+ // there is a good chance that we had an overflow, so we need
574
+ // to recover: we parse the whole thing again.
575
+ return parse_large_integer(buf, pj, offset, found_minus);
576
+ }
577
+ i = negative ? 0 - i : i;
578
+ pj.write_tape_s64(i);
579
+ #ifdef JSON_TEST_NUMBERS // for unit testing
580
+ found_integer(i, buf + offset);
581
+ #endif
582
+ }
583
+ return is_structural_or_whitespace(*p);
584
+ #endif // SIMDJSON_SKIPNUMBERPARSING
585
+ }
586
+ } // simdjson
587
+ #endif
@@ -0,0 +1,70 @@
1
+ #ifndef SIMDJSON_PADDING_STRING_H
2
+ #define SIMDJSON_PADDING_STRING_H
3
+ #include "simdjson/portability.h"
4
+ #include <cstring>
5
+ #include <memory>
6
+
7
+ namespace simdjson {
8
+ // low-level function to allocate memory with padding so we can read past the
9
+ // "length" bytes safely. if you must provide a pointer to some data, create it
10
+ // with this function: length is the max. size in bytes of the string; the caller is
11
+ // responsible to free the memory (free(...))
12
+ char *allocate_padded_buffer(size_t length);
13
+
14
+ // Simple string with padded allocation.
15
+ // We deliberately forbid copies, users should rely on swap or move
16
+ // constructors.
17
+ class padded_string {
18
+ public:
19
+ explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
20
+ explicit padded_string(size_t length) noexcept
21
+ : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
22
+
23
+ if (data_ptr != nullptr)
24
+ data_ptr[length] = '\0'; // easier when you need a c_str
25
+ }
26
+ explicit padded_string(char *data, size_t length) noexcept
27
+ : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
28
+ if (data_ptr != nullptr) {
29
+ memcpy(data_ptr, data, length);
30
+ data_ptr[length] = '\0'; // easier when you need a c_str
31
+ }
32
+ }
33
+ padded_string(std::string s) noexcept
34
+ : viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
35
+ if (data_ptr != nullptr) {
36
+ memcpy(data_ptr, s.data(), s.size());
37
+ data_ptr[s.size()] = '\0'; // easier when you need a c_str
38
+ }
39
+ }
40
+ padded_string(padded_string &&o) noexcept
41
+ : viable_size(o.viable_size), data_ptr(o.data_ptr) {
42
+ o.data_ptr = nullptr; // we take ownership
43
+ }
44
+ void swap(padded_string &o) {
45
+ size_t tmp_viable_size = viable_size;
46
+ char *tmp_data_ptr = data_ptr;
47
+ viable_size = o.viable_size;
48
+ data_ptr = o.data_ptr;
49
+ o.data_ptr = tmp_data_ptr;
50
+ o.viable_size = tmp_viable_size;
51
+ }
52
+
53
+ ~padded_string() { aligned_free_char(data_ptr); }
54
+
55
+ size_t size() const { return viable_size; }
56
+
57
+ size_t length() const { return viable_size; }
58
+
59
+ char *data() const { return data_ptr; }
60
+
61
+ private:
62
+ padded_string &operator=(const padded_string &o) = delete;
63
+ padded_string(const padded_string &o) = delete;
64
+
65
+ size_t viable_size;
66
+ char *data_ptr;
67
+ };
68
+ } // namespace simdjson
69
+
70
+ #endif