simdjson 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. checksums.yaml +7 -0
  2. data/.clang-format +5 -0
  3. data/.gitignore +14 -0
  4. data/.gitmodules +3 -0
  5. data/.rubocop.yml +9 -0
  6. data/.travis.yml +7 -0
  7. data/Gemfile +4 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +39 -0
  10. data/Rakefile +32 -0
  11. data/benchmark/apache_builds.json +4421 -0
  12. data/benchmark/demo.json +15 -0
  13. data/benchmark/github_events.json +1390 -0
  14. data/benchmark/run_benchmark.rb +30 -0
  15. data/ext/simdjson/extconf.rb +22 -0
  16. data/ext/simdjson/simdjson.cpp +76 -0
  17. data/ext/simdjson/simdjson.hpp +6 -0
  18. data/lib/simdjson/version.rb +3 -0
  19. data/lib/simdjson.rb +2 -0
  20. data/simdjson.gemspec +35 -0
  21. data/vendor/.gitkeep +0 -0
  22. data/vendor/simdjson/AUTHORS +3 -0
  23. data/vendor/simdjson/CMakeLists.txt +63 -0
  24. data/vendor/simdjson/CONTRIBUTORS +27 -0
  25. data/vendor/simdjson/Dockerfile +10 -0
  26. data/vendor/simdjson/LICENSE +201 -0
  27. data/vendor/simdjson/Makefile +203 -0
  28. data/vendor/simdjson/Notes.md +85 -0
  29. data/vendor/simdjson/README.md +581 -0
  30. data/vendor/simdjson/amalgamation.sh +158 -0
  31. data/vendor/simdjson/benchmark/CMakeLists.txt +8 -0
  32. data/vendor/simdjson/benchmark/benchmark.h +223 -0
  33. data/vendor/simdjson/benchmark/distinctuseridcompetition.cpp +347 -0
  34. data/vendor/simdjson/benchmark/linux/linux-perf-events.h +93 -0
  35. data/vendor/simdjson/benchmark/minifiercompetition.cpp +181 -0
  36. data/vendor/simdjson/benchmark/parse.cpp +393 -0
  37. data/vendor/simdjson/benchmark/parseandstatcompetition.cpp +305 -0
  38. data/vendor/simdjson/benchmark/parsingcompetition.cpp +298 -0
  39. data/vendor/simdjson/benchmark/statisticalmodel.cpp +208 -0
  40. data/vendor/simdjson/dependencies/jsoncppdist/json/json-forwards.h +344 -0
  41. data/vendor/simdjson/dependencies/jsoncppdist/json/json.h +2366 -0
  42. data/vendor/simdjson/dependencies/jsoncppdist/jsoncpp.cpp +5418 -0
  43. data/vendor/simdjson/doc/apache_builds.jsonparseandstat.png +0 -0
  44. data/vendor/simdjson/doc/gbps.png +0 -0
  45. data/vendor/simdjson/doc/github_events.jsonparseandstat.png +0 -0
  46. data/vendor/simdjson/doc/twitter.jsonparseandstat.png +0 -0
  47. data/vendor/simdjson/doc/update-center.jsonparseandstat.png +0 -0
  48. data/vendor/simdjson/images/halvarflake.png +0 -0
  49. data/vendor/simdjson/images/logo.png +0 -0
  50. data/vendor/simdjson/include/simdjson/common_defs.h +102 -0
  51. data/vendor/simdjson/include/simdjson/isadetection.h +152 -0
  52. data/vendor/simdjson/include/simdjson/jsoncharutils.h +301 -0
  53. data/vendor/simdjson/include/simdjson/jsonformatutils.h +202 -0
  54. data/vendor/simdjson/include/simdjson/jsonioutil.h +32 -0
  55. data/vendor/simdjson/include/simdjson/jsonminifier.h +30 -0
  56. data/vendor/simdjson/include/simdjson/jsonparser.h +250 -0
  57. data/vendor/simdjson/include/simdjson/numberparsing.h +587 -0
  58. data/vendor/simdjson/include/simdjson/padded_string.h +70 -0
  59. data/vendor/simdjson/include/simdjson/parsedjson.h +544 -0
  60. data/vendor/simdjson/include/simdjson/portability.h +172 -0
  61. data/vendor/simdjson/include/simdjson/simdjson.h +44 -0
  62. data/vendor/simdjson/include/simdjson/simdjson_version.h +13 -0
  63. data/vendor/simdjson/include/simdjson/simdprune_tables.h +35074 -0
  64. data/vendor/simdjson/include/simdjson/simdutf8check_arm64.h +180 -0
  65. data/vendor/simdjson/include/simdjson/simdutf8check_haswell.h +198 -0
  66. data/vendor/simdjson/include/simdjson/simdutf8check_westmere.h +169 -0
  67. data/vendor/simdjson/include/simdjson/stage1_find_marks.h +121 -0
  68. data/vendor/simdjson/include/simdjson/stage1_find_marks_arm64.h +210 -0
  69. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten.h +93 -0
  70. data/vendor/simdjson/include/simdjson/stage1_find_marks_flatten_haswell.h +95 -0
  71. data/vendor/simdjson/include/simdjson/stage1_find_marks_haswell.h +210 -0
  72. data/vendor/simdjson/include/simdjson/stage1_find_marks_macros.h +239 -0
  73. data/vendor/simdjson/include/simdjson/stage1_find_marks_westmere.h +194 -0
  74. data/vendor/simdjson/include/simdjson/stage2_build_tape.h +85 -0
  75. data/vendor/simdjson/include/simdjson/stringparsing.h +105 -0
  76. data/vendor/simdjson/include/simdjson/stringparsing_arm64.h +56 -0
  77. data/vendor/simdjson/include/simdjson/stringparsing_haswell.h +43 -0
  78. data/vendor/simdjson/include/simdjson/stringparsing_macros.h +88 -0
  79. data/vendor/simdjson/include/simdjson/stringparsing_westmere.h +41 -0
  80. data/vendor/simdjson/jsonexamples/small/jsoniter_scala/README.md +4 -0
  81. data/vendor/simdjson/scripts/dumpsimplestats.sh +11 -0
  82. data/vendor/simdjson/scripts/issue150.sh +14 -0
  83. data/vendor/simdjson/scripts/javascript/README.md +3 -0
  84. data/vendor/simdjson/scripts/javascript/generatelargejson.js +19 -0
  85. data/vendor/simdjson/scripts/minifier.sh +11 -0
  86. data/vendor/simdjson/scripts/parseandstat.sh +24 -0
  87. data/vendor/simdjson/scripts/parser.sh +11 -0
  88. data/vendor/simdjson/scripts/parsingcompdata.sh +26 -0
  89. data/vendor/simdjson/scripts/plotparse.sh +98 -0
  90. data/vendor/simdjson/scripts/selectparser.sh +11 -0
  91. data/vendor/simdjson/scripts/setupfortesting/disablehyperthreading.sh +15 -0
  92. data/vendor/simdjson/scripts/setupfortesting/powerpolicy.sh +32 -0
  93. data/vendor/simdjson/scripts/setupfortesting/setupfortesting.sh +6 -0
  94. data/vendor/simdjson/scripts/setupfortesting/turboboost.sh +51 -0
  95. data/vendor/simdjson/scripts/testjson2json.sh +99 -0
  96. data/vendor/simdjson/scripts/transitions/Makefile +10 -0
  97. data/vendor/simdjson/scripts/transitions/generatetransitions.cpp +20 -0
  98. data/vendor/simdjson/singleheader/README.md +1 -0
  99. data/vendor/simdjson/singleheader/amalgamation_demo.cpp +20 -0
  100. data/vendor/simdjson/singleheader/simdjson.cpp +1652 -0
  101. data/vendor/simdjson/singleheader/simdjson.h +39692 -0
  102. data/vendor/simdjson/src/CMakeLists.txt +67 -0
  103. data/vendor/simdjson/src/jsonioutil.cpp +35 -0
  104. data/vendor/simdjson/src/jsonminifier.cpp +285 -0
  105. data/vendor/simdjson/src/jsonparser.cpp +91 -0
  106. data/vendor/simdjson/src/parsedjson.cpp +323 -0
  107. data/vendor/simdjson/src/parsedjsoniterator.cpp +272 -0
  108. data/vendor/simdjson/src/simdjson.cpp +30 -0
  109. data/vendor/simdjson/src/stage1_find_marks.cpp +41 -0
  110. data/vendor/simdjson/src/stage2_build_tape.cpp +567 -0
  111. data/vendor/simdjson/style/clang-format-check.sh +25 -0
  112. data/vendor/simdjson/style/clang-format.sh +25 -0
  113. data/vendor/simdjson/style/run-clang-format.py +326 -0
  114. data/vendor/simdjson/tape.md +134 -0
  115. data/vendor/simdjson/tests/CMakeLists.txt +25 -0
  116. data/vendor/simdjson/tests/allparserscheckfile.cpp +192 -0
  117. data/vendor/simdjson/tests/basictests.cpp +75 -0
  118. data/vendor/simdjson/tests/jsoncheck.cpp +136 -0
  119. data/vendor/simdjson/tests/numberparsingcheck.cpp +224 -0
  120. data/vendor/simdjson/tests/pointercheck.cpp +38 -0
  121. data/vendor/simdjson/tests/singleheadertest.cpp +22 -0
  122. data/vendor/simdjson/tests/stringparsingcheck.cpp +408 -0
  123. data/vendor/simdjson/tools/CMakeLists.txt +3 -0
  124. data/vendor/simdjson/tools/cmake/FindCTargets.cmake +15 -0
  125. data/vendor/simdjson/tools/cmake/FindOptions.cmake +52 -0
  126. data/vendor/simdjson/tools/json2json.cpp +112 -0
  127. data/vendor/simdjson/tools/jsonpointer.cpp +93 -0
  128. data/vendor/simdjson/tools/jsonstats.cpp +143 -0
  129. data/vendor/simdjson/tools/minify.cpp +21 -0
  130. data/vendor/simdjson/tools/release.py +125 -0
  131. data/vendor/simdjson/windows/dirent_portable.h +1043 -0
  132. metadata +273 -0
@@ -0,0 +1,587 @@
1
+ #ifndef SIMDJSON_NUMBERPARSING_H
2
+ #define SIMDJSON_NUMBERPARSING_H
3
+
4
+ #include "simdjson/common_defs.h"
5
+ #include "simdjson/jsoncharutils.h"
6
+ #include "simdjson/parsedjson.h"
7
+ #include "simdjson/portability.h"
8
+
9
+ #ifdef JSON_TEST_NUMBERS // for unit testing
10
+ void found_invalid_number(const uint8_t *buf);
11
+ void found_integer(int64_t result, const uint8_t *buf);
12
+ void found_float(double result, const uint8_t *buf);
13
+ #endif
14
+
15
+ namespace simdjson {
16
+ // Allowable floating-point values range from
17
+ // std::numeric_limits<double>::lowest() to std::numeric_limits<double>::max(),
18
+ // so from -1.7976e308 all the way to 1.7976e308 in binary64. The smallest
19
+ // positive normal value is std::numeric_limits<double>::min() or
20
+ // about 2.225074e-308.
21
+ static const double power_of_ten[] = {
22
+ 1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
23
+ 1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
24
+ 1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282,
25
+ 1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273,
26
+ 1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264,
27
+ 1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255,
28
+ 1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246,
29
+ 1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237,
30
+ 1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228,
31
+ 1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219,
32
+ 1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210,
33
+ 1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201,
34
+ 1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192,
35
+ 1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183,
36
+ 1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174,
37
+ 1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165,
38
+ 1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156,
39
+ 1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147,
40
+ 1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138,
41
+ 1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129,
42
+ 1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120,
43
+ 1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111,
44
+ 1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102,
45
+ 1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93,
46
+ 1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84,
47
+ 1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, 1e-75,
48
+ 1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66,
49
+ 1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57,
50
+ 1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48,
51
+ 1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39,
52
+ 1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30,
53
+ 1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21,
54
+ 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12,
55
+ 1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3,
56
+ 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6,
57
+ 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
58
+ 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24,
59
+ 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33,
60
+ 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42,
61
+ 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51,
62
+ 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60,
63
+ 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
64
+ 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78,
65
+ 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87,
66
+ 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96,
67
+ 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105,
68
+ 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114,
69
+ 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123,
70
+ 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132,
71
+ 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141,
72
+ 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150,
73
+ 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
74
+ 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168,
75
+ 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177,
76
+ 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186,
77
+ 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195,
78
+ 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204,
79
+ 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213,
80
+ 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222,
81
+ 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231,
82
+ 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240,
83
+ 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
84
+ 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258,
85
+ 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267,
86
+ 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276,
87
+ 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285,
88
+ 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294,
89
+ 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303,
90
+ 1e304, 1e305, 1e306, 1e307, 1e308};
91
+
92
// Returns true when c is one of the ASCII decimal digits '0' through '9'.
static inline bool is_integer(char c) {
  // One unsigned comparison covers both bounds: a character below '0' yields
  // a negative difference, which becomes a large unsigned value and is
  // rejected together with everything above '9'.
  return static_cast<unsigned>(c - '0') <= 9u;
}
96
+
97
+ // We need to check that the character following a zero is valid. This is
98
+ // probably frequent and it is harder than it looks. We are building all of this
99
+ // just to differentiate between 0x1 (invalid), 0,1 (valid) 0e1 (valid)...
100
+ const bool structural_or_whitespace_or_exponent_or_decimal_negated[256] = {
101
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
102
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
103
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
104
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
105
+ 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
106
+ 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
107
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
108
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
109
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
110
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
111
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
112
+
113
+ really_inline bool
114
+ is_not_structural_or_whitespace_or_exponent_or_decimal(unsigned char c) {
115
+ return structural_or_whitespace_or_exponent_or_decimal_negated[c];
116
+ }
117
+ } // namespace simdjson
118
+ #ifndef SIMDJSON_DISABLE_SWAR_NUMBER_PARSING
119
+ #define SWAR_NUMBER_PARSING
120
+ #endif
121
+
122
+ #ifdef SWAR_NUMBER_PARSING
123
+
124
+ namespace simdjson {
125
+ // check quickly whether the next 8 chars are made of digits
126
+ // at a glance, it looks better than Mula's
127
+ // http://0x80.pl/articles/swar-digits-validate.html
128
+ static inline bool is_made_of_eight_digits_fast(const char *chars) {
129
+ uint64_t val;
130
+ // this can read up to 7 bytes beyond the buffer size, but we require
131
+ // SIMDJSON_PADDING of padding
132
+ static_assert(7 <= SIMDJSON_PADDING);
133
+ memcpy(&val, chars, 8);
134
+ // a branchy method might be faster:
135
+ // return (( val & 0xF0F0F0F0F0F0F0F0 ) == 0x3030303030303030)
136
+ // && (( (val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0 ) ==
137
+ // 0x3030303030303030);
138
+ return (((val & 0xF0F0F0F0F0F0F0F0) |
139
+ (((val + 0x0606060606060606) & 0xF0F0F0F0F0F0F0F0) >> 4)) ==
140
+ 0x3333333333333333);
141
+ }
142
+ } // namespace simdjson
143
+ #ifdef IS_X86_64
144
+ TARGET_WESTMERE
145
+ namespace simdjson {
146
+ static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
147
+ // this actually computes *16* values so we are being wasteful.
148
+ const __m128i ascii0 = _mm_set1_epi8('0');
149
+ const __m128i mul_1_10 =
150
+ _mm_setr_epi8(10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1, 10, 1);
151
+ const __m128i mul_1_100 = _mm_setr_epi16(100, 1, 100, 1, 100, 1, 100, 1);
152
+ const __m128i mul_1_10000 =
153
+ _mm_setr_epi16(10000, 1, 10000, 1, 10000, 1, 10000, 1);
154
+ const __m128i input = _mm_sub_epi8(
155
+ _mm_loadu_si128(reinterpret_cast<const __m128i *>(chars)), ascii0);
156
+ const __m128i t1 = _mm_maddubs_epi16(input, mul_1_10);
157
+ const __m128i t2 = _mm_madd_epi16(t1, mul_1_100);
158
+ const __m128i t3 = _mm_packus_epi32(t2, t2);
159
+ const __m128i t4 = _mm_madd_epi16(t3, mul_1_10000);
160
+ return _mm_cvtsi128_si32(
161
+ t4); // only captures the sum of the first 8 digits, drop the rest
162
+ }
163
+ } // namespace simdjson
164
+ UNTARGET_REGION
165
+ #endif
166
+
167
+ namespace simdjson {
168
+ #ifdef IS_ARM64
169
// Scalar (SWAR) conversion of the eight ASCII digits at chars into their
// numeric value, used where SSE is not available.
// credit: https://johnnylee-sde.github.io/Fast-numeric-string-to-int/
//
// Each step merges neighbouring groups with one multiplication:
//   2561           == 10    * 2^8  + 1  (digits          -> 2-digit values)
//   6553601        == 100   * 2^16 + 1  (2-digit values  -> 4-digit values)
//   42949672960001 == 10000 * 2^32 + 1  (4-digit values  -> 8-digit value)
// NOTE(review): the first character must land in the lowest byte of the
// loaded word, i.e. this assumes a little-endian target.
static inline uint32_t parse_eight_digits_unrolled(const char *chars) {
  uint64_t acc;
  memcpy(&acc, chars, sizeof(acc));
  acc = ((acc & 0x0F0F0F0F0F0F0F0F) * 2561) >> 8;
  acc = ((acc & 0x00FF00FF00FF00FF) * 6553601) >> 16;
  acc = ((acc & 0x0000FFFF0000FFFF) * 42949672960001) >> 32;
  // Only the low 32 bits carry the result.
  return static_cast<uint32_t>(acc);
}
178
+ #endif
179
+
180
+ #endif
181
+
182
//
// Computes base * 10^negative_exponent.
// parse_float only calls it for exponents below -308, where the result is
// either zero or a subnormal, so it is only ever used for tiny values.
static double subnormal_power10(double base, int negative_exponent) {
  // The scaling is applied in two pieces, 1e-308 and
  // 10^(negative_exponent + 308) -- presumably so that each factor stays
  // closer to the representable range than 10^negative_exponent would.
  // This is probably not going to be fast.
  const double scaled_base = base * 1e-308;
  const double remaining_power = pow(10, negative_exponent + 308);
  return scaled_base * remaining_power;
}
189
+
190
+ // called by parse_number when we know that the output is a float,
191
+ // but where there might be some integer overflow. The trick here is to
192
+ // parse using floats from the start.
193
+ // Do not call this function directly as it skips some of the checks from
194
+ // parse_number
195
+ //
196
+ // This function will almost never be called!!!
197
+ //
198
+ // Note: a redesign could avoid this function entirely.
199
+ //
200
+ static never_inline bool parse_float(const uint8_t *const buf, ParsedJson &pj,
201
+ const uint32_t offset, bool found_minus) {
202
+ const char *p = reinterpret_cast<const char *>(buf + offset);
203
+ bool negative = false;
204
+ if (found_minus) {
205
+ ++p;
206
+ negative = true;
207
+ }
208
+ long double i;
209
+ if (*p == '0') { // 0 cannot be followed by an integer
210
+ ++p;
211
+ i = 0;
212
+ } else {
213
+ unsigned char digit = *p - '0';
214
+ i = digit;
215
+ p++;
216
+ while (is_integer(*p)) {
217
+ digit = *p - '0';
218
+ i = 10 * i + digit;
219
+ ++p;
220
+ }
221
+ }
222
+ if ('.' == *p) {
223
+ ++p;
224
+ int fractional_weight = 308;
225
+ if (is_integer(*p)) {
226
+ unsigned char digit = *p - '0';
227
+ ++p;
228
+
229
+ fractional_weight--;
230
+ i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
231
+ : 0);
232
+ } else {
233
+ #ifdef JSON_TEST_NUMBERS // for unit testing
234
+ found_invalid_number(buf + offset);
235
+ #endif
236
+ return false;
237
+ }
238
+ while (is_integer(*p)) {
239
+ unsigned char digit = *p - '0';
240
+ ++p;
241
+ fractional_weight--;
242
+ i = i + digit * (fractional_weight >= 0 ? power_of_ten[fractional_weight]
243
+ : 0);
244
+ }
245
+ }
246
+ if (('e' == *p) || ('E' == *p)) {
247
+ ++p;
248
+ bool neg_exp = false;
249
+ if ('-' == *p) {
250
+ neg_exp = true;
251
+ ++p;
252
+ } else if ('+' == *p) {
253
+ ++p;
254
+ }
255
+ if (!is_integer(*p)) {
256
+ #ifdef JSON_TEST_NUMBERS // for unit testing
257
+ found_invalid_number(buf + offset);
258
+ #endif
259
+ return false;
260
+ }
261
+ unsigned char digit = *p - '0';
262
+ int64_t exp_number = digit; // exponential part
263
+ p++;
264
+ if (is_integer(*p)) {
265
+ digit = *p - '0';
266
+ exp_number = 10 * exp_number + digit;
267
+ ++p;
268
+ }
269
+ if (is_integer(*p)) {
270
+ digit = *p - '0';
271
+ exp_number = 10 * exp_number + digit;
272
+ ++p;
273
+ }
274
+ if (is_integer(*p)) {
275
+ digit = *p - '0';
276
+ exp_number = 10 * exp_number + digit;
277
+ ++p;
278
+ }
279
+ while (is_integer(*p)) {
280
+ if (exp_number > 0x100000000) { // we need to check for overflows
281
+ // we refuse to parse this
282
+ #ifdef JSON_TEST_NUMBERS // for unit testing
283
+ found_invalid_number(buf + offset);
284
+ #endif
285
+ return false;
286
+ }
287
+ digit = *p - '0';
288
+ exp_number = 10 * exp_number + digit;
289
+ ++p;
290
+ }
291
+ if (unlikely(exp_number > 308)) {
292
+ // this path is unlikely
293
+ if (neg_exp) {
294
+ // We either have zero or a subnormal.
295
+ // We expect this to be uncommon so we go through a slow path.
296
+ i = subnormal_power10(i, -exp_number);
297
+ } else {
298
+ // We know for sure that we have a number that is too large,
299
+ // we refuse to parse this
300
+ #ifdef JSON_TEST_NUMBERS // for unit testing
301
+ found_invalid_number(buf + offset);
302
+ #endif
303
+ return false;
304
+ }
305
+ } else {
306
+ int exponent = (neg_exp ? -exp_number : exp_number);
307
+ // we have that exp_number is [0,308] so that
308
+ // exponent is [-308,308] so that
309
+ // 308 + exponent is in [0, 2 * 308]
310
+ i *= power_of_ten[308 + exponent];
311
+ }
312
+ }
313
+ if (is_not_structural_or_whitespace(*p)) {
314
+ return false;
315
+ }
316
+ double d = negative ? -i : i;
317
+ pj.write_tape_double(d);
318
+ #ifdef JSON_TEST_NUMBERS // for unit testing
319
+ found_float(d, buf + offset);
320
+ #endif
321
+ return is_structural_or_whitespace(*p);
322
+ }
323
+
324
+ // called by parse_number when we know that the output is an integer,
325
+ // but where there might be some integer overflow.
326
+ // we want to catch overflows!
327
+ // Do not call this function directly as it skips some of the checks from
328
+ // parse_number
329
+ //
330
+ // This function will almost never be called!!!
331
+ //
332
+ static never_inline bool parse_large_integer(const uint8_t *const buf,
333
+ ParsedJson &pj,
334
+ const uint32_t offset,
335
+ bool found_minus) {
336
+ const char *p = reinterpret_cast<const char *>(buf + offset);
337
+
338
+ bool negative = false;
339
+ if (found_minus) {
340
+ ++p;
341
+ negative = true;
342
+ }
343
+ uint64_t i;
344
+ if (*p == '0') { // 0 cannot be followed by an integer
345
+ ++p;
346
+ i = 0;
347
+ } else {
348
+ unsigned char digit = *p - '0';
349
+ i = digit;
350
+ p++;
351
+ // the is_made_of_eight_digits_fast routine is unlikely to help here because
352
+ // we rarely see large integer parts like 123456789
353
+ while (is_integer(*p)) {
354
+ digit = *p - '0';
355
+ if (mul_overflow(i, 10, &i)) {
356
+ #ifdef JSON_TEST_NUMBERS // for unit testing
357
+ found_invalid_number(buf + offset);
358
+ #endif
359
+ return false; // overflow
360
+ }
361
+ if (add_overflow(i, digit, &i)) {
362
+ #ifdef JSON_TEST_NUMBERS // for unit testing
363
+ found_invalid_number(buf + offset);
364
+ #endif
365
+ return false; // overflow
366
+ }
367
+ ++p;
368
+ }
369
+ }
370
+ if (negative) {
371
+ if (i > 0x8000000000000000) {
372
+ // overflows!
373
+ #ifdef JSON_TEST_NUMBERS // for unit testing
374
+ found_invalid_number(buf + offset);
375
+ #endif
376
+ return false; // overflow
377
+ }
378
+ } else {
379
+ if (i >= 0x8000000000000000) {
380
+ // overflows!
381
+ #ifdef JSON_TEST_NUMBERS // for unit testing
382
+ found_invalid_number(buf + offset);
383
+ #endif
384
+ return false; // overflow
385
+ }
386
+ }
387
+ int64_t signed_answer =
388
+ negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
389
+ pj.write_tape_s64(signed_answer);
390
+ #ifdef JSON_TEST_NUMBERS // for unit testing
391
+ found_integer(signed_answer, buf + offset);
392
+ #endif
393
+ return is_structural_or_whitespace(*p);
394
+ }
395
+
396
+ // parse the number at buf + offset
397
+ // define JSON_TEST_NUMBERS for unit testing
398
+ //
399
+ // It is assumed that the number is followed by a structural ({,},],[) character
400
+ // or a white space character. If that is not the case (e.g., when the JSON
401
+ // document is made of a single number), then it is necessary to copy the
402
+ // content and append a space before calling this function.
403
+ //
404
+ // Our objective is accurate parsing (ULP of 0 or 1) at high speed.
405
+ static really_inline bool parse_number(const uint8_t *const buf, ParsedJson &pj,
406
+ const uint32_t offset,
407
+ bool found_minus) {
408
+ #ifdef SIMDJSON_SKIPNUMBERPARSING // for performance analysis, it is sometimes
409
+ // useful to skip parsing
410
+ pj.write_tape_s64(0); // always write zero
411
+ return true; // always succeeds
412
+ #else
413
+ const char *p = reinterpret_cast<const char *>(buf + offset);
414
+ bool negative = false;
415
+ if (found_minus) {
416
+ ++p;
417
+ negative = true;
418
+ if (!is_integer(*p)) { // a negative sign must be followed by an integer
419
+ #ifdef JSON_TEST_NUMBERS // for unit testing
420
+ found_invalid_number(buf + offset);
421
+ #endif
422
+ return false;
423
+ }
424
+ }
425
+ const char *const start_digits = p;
426
+
427
+ uint64_t i; // an unsigned int avoids signed overflows (which are bad)
428
+ if (*p == '0') { // 0 cannot be followed by an integer
429
+ ++p;
430
+ if (is_not_structural_or_whitespace_or_exponent_or_decimal(*p)) {
431
+ #ifdef JSON_TEST_NUMBERS // for unit testing
432
+ found_invalid_number(buf + offset);
433
+ #endif
434
+ return false;
435
+ }
436
+ i = 0;
437
+ } else {
438
+ if (!(is_integer(*p))) { // must start with an integer
439
+ #ifdef JSON_TEST_NUMBERS // for unit testing
440
+ found_invalid_number(buf + offset);
441
+ #endif
442
+ return false;
443
+ }
444
+ unsigned char digit = *p - '0';
445
+ i = digit;
446
+ p++;
447
+ // the is_made_of_eight_digits_fast routine is unlikely to help here because
448
+ // we rarely see large integer parts like 123456789
449
+ while (is_integer(*p)) {
450
+ digit = *p - '0';
451
+ // a multiplication by 10 is cheaper than an arbitrary integer
452
+ // multiplication
453
+ i = 10 * i + digit; // might overflow, we will handle the overflow later
454
+ ++p;
455
+ }
456
+ }
457
+ int64_t exponent = 0;
458
+ bool is_float = false;
459
+ if ('.' == *p) {
460
+ is_float = true; // At this point we know that we have a float
461
+ // we continue with the fiction that we have an integer. If the
462
+ // floating point number is representable as x * 10^z for some integer
463
+ // z that fits in 53 bits, then we will be able to convert back the
464
+ // the integer into a float in a lossless manner.
465
+ ++p;
466
+ const char *const first_after_period = p;
467
+ if (is_integer(*p)) {
468
+ unsigned char digit = *p - '0';
469
+ ++p;
470
+ i = i * 10 + digit; // might overflow + multiplication by 10 is likely
471
+ // cheaper than arbitrary mult.
472
+ // we will handle the overflow later
473
+ } else {
474
+ #ifdef JSON_TEST_NUMBERS // for unit testing
475
+ found_invalid_number(buf + offset);
476
+ #endif
477
+ return false;
478
+ }
479
+ #ifdef SWAR_NUMBER_PARSING
480
+ // this helps if we have lots of decimals!
481
+ // this turns out to be frequent enough.
482
+ if (is_made_of_eight_digits_fast(p)) {
483
+ i = i * 100000000 + parse_eight_digits_unrolled(p);
484
+ p += 8;
485
+ }
486
+ #endif
487
+ while (is_integer(*p)) {
488
+ unsigned char digit = *p - '0';
489
+ ++p;
490
+ i = i * 10 + digit; // in rare cases, this will overflow, but that's ok
491
+ // because we have parse_highprecision_float later.
492
+ }
493
+ exponent = first_after_period - p;
494
+ }
495
+ int digit_count =
496
+ p - start_digits - 1; // used later to guard against overflows
497
+ int64_t exp_number = 0; // exponential part
498
+ if (('e' == *p) || ('E' == *p)) {
499
+ is_float = true;
500
+ ++p;
501
+ bool neg_exp = false;
502
+ if ('-' == *p) {
503
+ neg_exp = true;
504
+ ++p;
505
+ } else if ('+' == *p) {
506
+ ++p;
507
+ }
508
+ if (!is_integer(*p)) {
509
+ #ifdef JSON_TEST_NUMBERS // for unit testing
510
+ found_invalid_number(buf + offset);
511
+ #endif
512
+ return false;
513
+ }
514
+ unsigned char digit = *p - '0';
515
+ exp_number = digit;
516
+ p++;
517
+ if (is_integer(*p)) {
518
+ digit = *p - '0';
519
+ exp_number = 10 * exp_number + digit;
520
+ ++p;
521
+ }
522
+ if (is_integer(*p)) {
523
+ digit = *p - '0';
524
+ exp_number = 10 * exp_number + digit;
525
+ ++p;
526
+ }
527
+ while (is_integer(*p)) {
528
+ if (exp_number > 0x100000000) { // we need to check for overflows
529
+ // we refuse to parse this
530
+ #ifdef JSON_TEST_NUMBERS // for unit testing
531
+ found_invalid_number(buf + offset);
532
+ #endif
533
+ return false;
534
+ }
535
+ digit = *p - '0';
536
+ exp_number = 10 * exp_number + digit;
537
+ ++p;
538
+ }
539
+ exponent += (neg_exp ? -exp_number : exp_number);
540
+ }
541
+ if (is_float) {
542
+ uint64_t power_index = 308 + exponent;
543
+ if (unlikely((digit_count >= 19))) { // this is uncommon
544
+ // It is possible that the integer had an overflow.
545
+ // We have to handle the case where we have 0.0000somenumber.
546
+ const char *start = start_digits;
547
+ while ((*start == '0') || (*start == '.')) {
548
+ start++;
549
+ }
550
+ // we over-decrement by one when there is a '.'
551
+ digit_count -= (start - start_digits);
552
+ if (digit_count >= 19) {
553
+ // Ok, chances are good that we had an overflow!
554
+ // this is almost never going to get called!!!
555
+ // we start anew, going slowly!!!
556
+ return parse_float(buf, pj, offset, found_minus);
557
+ }
558
+ }
559
+ if (unlikely((power_index > 2 * 308))) { // this is uncommon!!!
560
+ // this is almost never going to get called!!!
561
+ // we start anew, going slowly!!!
562
+ return parse_float(buf, pj, offset, found_minus);
563
+ }
564
+ double factor = power_of_ten[power_index];
565
+ factor = negative ? -factor : factor;
566
+ double d = i * factor;
567
+ pj.write_tape_double(d);
568
+ #ifdef JSON_TEST_NUMBERS // for unit testing
569
+ found_float(d, buf + offset);
570
+ #endif
571
+ } else {
572
+ if (unlikely(digit_count >= 18)) { // this is uncommon!!!
573
+ // there is a good chance that we had an overflow, so we need
574
+ // need to recover: we parse the whole thing again.
575
+ return parse_large_integer(buf, pj, offset, found_minus);
576
+ }
577
+ i = negative ? 0 - i : i;
578
+ pj.write_tape_s64(i);
579
+ #ifdef JSON_TEST_NUMBERS // for unit testing
580
+ found_integer(i, buf + offset);
581
+ #endif
582
+ }
583
+ return is_structural_or_whitespace(*p);
584
+ #endif // SIMDJSON_SKIPNUMBERPARSING
585
+ }
586
+ } // simdjson
587
+ #endif
@@ -0,0 +1,70 @@
1
+ #ifndef SIMDJSON_PADDING_STRING_H
2
+ #define SIMDJSON_PADDING_STRING_H
3
+ #include "simdjson/portability.h"
4
+ #include <cstring>
5
+ #include <memory>
6
+
7
+ namespace simdjson {
8
+ // low-level function to allocate memory with padding so we can read past the
9
+ // "length" bytes safely. If you must provide a pointer to some data, create it
10
+ // with this function: length is the max. size in bytes of the string. The caller
11
+ // is responsible for freeing the memory (free(...)).
12
+ char *allocate_padded_buffer(size_t length);
13
+
14
+ // Simple string with padded allocation.
15
+ // We deliberately forbid copies, users should rely on swap or move
16
+ // constructors.
17
+ class padded_string {
18
+ public:
19
+ explicit padded_string() noexcept : viable_size(0), data_ptr(nullptr) {}
20
+ explicit padded_string(size_t length) noexcept
21
+ : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
22
+
23
+ if (data_ptr != nullptr)
24
+ data_ptr[length] = '\0'; // easier when you need a c_str
25
+ }
26
+ explicit padded_string(char *data, size_t length) noexcept
27
+ : viable_size(length), data_ptr(allocate_padded_buffer(length)) {
28
+ if (data_ptr != nullptr) {
29
+ memcpy(data_ptr, data, length);
30
+ data_ptr[length] = '\0'; // easier when you need a c_str
31
+ }
32
+ }
33
+ padded_string(std::string s) noexcept
34
+ : viable_size(s.size()), data_ptr(allocate_padded_buffer(s.size())) {
35
+ if (data_ptr != nullptr) {
36
+ memcpy(data_ptr, s.data(), s.size());
37
+ data_ptr[s.size()] = '\0'; // easier when you need a c_str
38
+ }
39
+ }
40
+ padded_string(padded_string &&o) noexcept
41
+ : viable_size(o.viable_size), data_ptr(o.data_ptr) {
42
+ o.data_ptr = nullptr; // we take ownership
43
+ }
44
+ void swap(padded_string &o) {
45
+ size_t tmp_viable_size = viable_size;
46
+ char *tmp_data_ptr = data_ptr;
47
+ viable_size = o.viable_size;
48
+ data_ptr = o.data_ptr;
49
+ o.data_ptr = tmp_data_ptr;
50
+ o.viable_size = tmp_viable_size;
51
+ }
52
+
53
+ ~padded_string() { aligned_free_char(data_ptr); }
54
+
55
+ size_t size() const { return viable_size; }
56
+
57
+ size_t length() const { return viable_size; }
58
+
59
+ char *data() const { return data_ptr; }
60
+
61
+ private:
62
+ padded_string &operator=(const padded_string &o) = delete;
63
+ padded_string(const padded_string &o) = delete;
64
+
65
+ size_t viable_size;
66
+ char *data_ptr;
67
+ };
68
+ } // namespace simdjson
69
+
70
+ #endif