react-native-quick-crypto 1.0.18 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (751)
  1. package/QuickCrypto.podspec +12 -38
  2. package/README.md +2 -0
  3. package/android/CMakeLists.txt +3 -0
  4. package/cpp/utils/HybridUtils.cpp +39 -77
  5. package/deps/simdutf/.clang-format +4 -0
  6. package/deps/simdutf/.github/ISSUE_TEMPLATE/bug_report.md +62 -0
  7. package/deps/simdutf/.github/ISSUE_TEMPLATE/config.yml +1 -0
  8. package/deps/simdutf/.github/ISSUE_TEMPLATE/feature_request.md +35 -0
  9. package/deps/simdutf/.github/ISSUE_TEMPLATE/standard-issue-template.md +29 -0
  10. package/deps/simdutf/.github/pull_request_template.md +51 -0
  11. package/deps/simdutf/.github/workflows/aarch64.yml +39 -0
  12. package/deps/simdutf/.github/workflows/alpine.yml +27 -0
  13. package/deps/simdutf/.github/workflows/amalgamation_demos.yml +34 -0
  14. package/deps/simdutf/.github/workflows/armv7.yml +32 -0
  15. package/deps/simdutf/.github/workflows/atomic_fuzz.yml +25 -0
  16. package/deps/simdutf/.github/workflows/cifuzz.yml +37 -0
  17. package/deps/simdutf/.github/workflows/clangformat.yml +36 -0
  18. package/deps/simdutf/.github/workflows/debian-latestcxxstandards.yml +40 -0
  19. package/deps/simdutf/.github/workflows/debian.yml +33 -0
  20. package/deps/simdutf/.github/workflows/documentation.yml +36 -0
  21. package/deps/simdutf/.github/workflows/emscripten.yml +19 -0
  22. package/deps/simdutf/.github/workflows/loongarch64-gcc-14.2.yml +39 -0
  23. package/deps/simdutf/.github/workflows/macos-latest.yml +29 -0
  24. package/deps/simdutf/.github/workflows/msys2-clang.yml +48 -0
  25. package/deps/simdutf/.github/workflows/msys2.yml +50 -0
  26. package/deps/simdutf/.github/workflows/ppc64le.yml +29 -0
  27. package/deps/simdutf/.github/workflows/rvv-1024-clang-18.yml +35 -0
  28. package/deps/simdutf/.github/workflows/rvv-128-clang-17.yml +35 -0
  29. package/deps/simdutf/.github/workflows/rvv-256-gcc-14.yml +31 -0
  30. package/deps/simdutf/.github/workflows/s390x.yml +29 -0
  31. package/deps/simdutf/.github/workflows/selective-amalgamation.yml +29 -0
  32. package/deps/simdutf/.github/workflows/typos.yml +19 -0
  33. package/deps/simdutf/.github/workflows/ubuntu22-cxx20.yml +30 -0
  34. package/deps/simdutf/.github/workflows/ubuntu22.yml +32 -0
  35. package/deps/simdutf/.github/workflows/ubuntu22_gcc12.yml +27 -0
  36. package/deps/simdutf/.github/workflows/ubuntu22sani.yml +29 -0
  37. package/deps/simdutf/.github/workflows/ubuntu24-cxxstandards.yml +34 -0
  38. package/deps/simdutf/.github/workflows/ubuntu24-unsignedchar.yml +34 -0
  39. package/deps/simdutf/.github/workflows/ubuntu24.yml +32 -0
  40. package/deps/simdutf/.github/workflows/ubuntu24sani.yml +36 -0
  41. package/deps/simdutf/.github/workflows/ubuntu24sani_clang.yml +29 -0
  42. package/deps/simdutf/.github/workflows/vs17-arm-ci.yml +21 -0
  43. package/deps/simdutf/.github/workflows/vs17-ci-cxx20.yml +41 -0
  44. package/deps/simdutf/.github/workflows/vs17-ci.yml +41 -0
  45. package/deps/simdutf/.github/workflows/vs17-clang-ci.yml +41 -0
  46. package/deps/simdutf/.github/workflows/vs17-cxxstandards.yml +36 -0
  47. package/deps/simdutf/AI_USAGE_POLICY.md +56 -0
  48. package/deps/simdutf/AUTHORS +6 -0
  49. package/deps/simdutf/CMakeLists.txt +231 -0
  50. package/deps/simdutf/CONTRIBUTING.md +214 -0
  51. package/deps/simdutf/CONTRIBUTORS +1 -0
  52. package/deps/simdutf/Doxyfile +2584 -0
  53. package/deps/simdutf/LICENSE-APACHE +201 -0
  54. package/deps/simdutf/LICENSE-MIT +18 -0
  55. package/deps/simdutf/Makefile.crosscompile +54 -0
  56. package/deps/simdutf/README-RVV.md +16 -0
  57. package/deps/simdutf/README.md +2782 -0
  58. package/deps/simdutf/SECURITY.md +8 -0
  59. package/deps/simdutf/benchmarks/CMakeLists.txt +101 -0
  60. package/deps/simdutf/benchmarks/alignment.cpp +150 -0
  61. package/deps/simdutf/benchmarks/base64/CMakeLists.txt +30 -0
  62. package/deps/simdutf/benchmarks/base64/benchmark_base64.cpp +875 -0
  63. package/deps/simdutf/benchmarks/base64/libbase64_spaces.h +49 -0
  64. package/deps/simdutf/benchmarks/base64/node_base64.h +227 -0
  65. package/deps/simdutf/benchmarks/base64/openssl3_base64.h +334 -0
  66. package/deps/simdutf/benchmarks/benchmark.cpp +65 -0
  67. package/deps/simdutf/benchmarks/benchmark_to_well_formed_utf16.cpp +347 -0
  68. package/deps/simdutf/benchmarks/competition/.clang-format-ignore +5 -0
  69. package/deps/simdutf/benchmarks/competition/CppCon2018/utf_utils.cpp +1276 -0
  70. package/deps/simdutf/benchmarks/competition/CppCon2018/utf_utils.h +595 -0
  71. package/deps/simdutf/benchmarks/competition/README.md +7 -0
  72. package/deps/simdutf/benchmarks/competition/hoehrmann/hoehrmann.h +91 -0
  73. package/deps/simdutf/benchmarks/competition/inoue2008/inoue_utf8_to_utf16.h +444 -0
  74. package/deps/simdutf/benchmarks/competition/inoue2008/inoue_utf8_to_utf16_tables.h +13183 -0
  75. package/deps/simdutf/benchmarks/competition/inoue2008/script.py +73 -0
  76. package/deps/simdutf/benchmarks/competition/llvm/ConvertUTF.cpp +738 -0
  77. package/deps/simdutf/benchmarks/competition/llvm/ConvertUTF.h +293 -0
  78. package/deps/simdutf/benchmarks/competition/u8u16/COPYRIGHT +8 -0
  79. package/deps/simdutf/benchmarks/competition/u8u16/Makefile +44 -0
  80. package/deps/simdutf/benchmarks/competition/u8u16/OSL3.0.txt +169 -0
  81. package/deps/simdutf/benchmarks/competition/u8u16/Profiling/BOM_Profiler.h +148 -0
  82. package/deps/simdutf/benchmarks/competition/u8u16/Profiling/i386_timer.h +45 -0
  83. package/deps/simdutf/benchmarks/competition/u8u16/Profiling/ppc_timer.c +34 -0
  84. package/deps/simdutf/benchmarks/competition/u8u16/README +56 -0
  85. package/deps/simdutf/benchmarks/competition/u8u16/config/config_defs.h +43 -0
  86. package/deps/simdutf/benchmarks/competition/u8u16/config/g4_config.h +27 -0
  87. package/deps/simdutf/benchmarks/competition/u8u16/config/mmx_config.h +16 -0
  88. package/deps/simdutf/benchmarks/competition/u8u16/config/p4_config.h +18 -0
  89. package/deps/simdutf/benchmarks/competition/u8u16/config/p4_ideal_config.h +16 -0
  90. package/deps/simdutf/benchmarks/competition/u8u16/config/spu_config.h +28 -0
  91. package/deps/simdutf/benchmarks/competition/u8u16/config/ssse3_config.h +20 -0
  92. package/deps/simdutf/benchmarks/competition/u8u16/iconv_u8u16.c +2 -0
  93. package/deps/simdutf/benchmarks/competition/u8u16/lib/altivec_simd.h +440 -0
  94. package/deps/simdutf/benchmarks/competition/u8u16/lib/libgen/make_basic_ops.py +121 -0
  95. package/deps/simdutf/benchmarks/competition/u8u16/lib/libgen/make_half_operand_versions.py +158 -0
  96. package/deps/simdutf/benchmarks/competition/u8u16/lib/libgen/make_test.py +270 -0
  97. package/deps/simdutf/benchmarks/competition/u8u16/lib/mmx_simd.h +141 -0
  98. package/deps/simdutf/benchmarks/competition/u8u16/lib/mmx_simd_basic.h +216 -0
  99. package/deps/simdutf/benchmarks/competition/u8u16/lib/mmx_simd_built_in.h +119 -0
  100. package/deps/simdutf/benchmarks/competition/u8u16/lib/mmx_simd_modified.h +2430 -0
  101. package/deps/simdutf/benchmarks/competition/u8u16/lib/outline.txt +39 -0
  102. package/deps/simdutf/benchmarks/competition/u8u16/lib/spu_simd.h +421 -0
  103. package/deps/simdutf/benchmarks/competition/u8u16/lib/sse_simd.h +836 -0
  104. package/deps/simdutf/benchmarks/competition/u8u16/lib/stdint.h +222 -0
  105. package/deps/simdutf/benchmarks/competition/u8u16/libu8u16_BE.c +4 -0
  106. package/deps/simdutf/benchmarks/competition/u8u16/libu8u16_LE.c +5 -0
  107. package/deps/simdutf/benchmarks/competition/u8u16/proto/u8u16.py +390 -0
  108. package/deps/simdutf/benchmarks/competition/u8u16/src/Makefile +18 -0
  109. package/deps/simdutf/benchmarks/competition/u8u16/src/bytelex.h +448 -0
  110. package/deps/simdutf/benchmarks/competition/u8u16/src/charsets/ASCII_EBCDIC.h +284 -0
  111. package/deps/simdutf/benchmarks/competition/u8u16/src/libu8u16.c +1975 -0
  112. package/deps/simdutf/benchmarks/competition/u8u16/src/libu8u16.pdf +0 -0
  113. package/deps/simdutf/benchmarks/competition/u8u16/src/libu8u16.w +2263 -0
  114. package/deps/simdutf/benchmarks/competition/u8u16/src/multiliteral.h +239 -0
  115. package/deps/simdutf/benchmarks/competition/u8u16/src/u8u16.c +232 -0
  116. package/deps/simdutf/benchmarks/competition/u8u16/src/x8x16.c +194 -0
  117. package/deps/simdutf/benchmarks/competition/u8u16/src/xml_error.c +193 -0
  118. package/deps/simdutf/benchmarks/competition/u8u16/src/xml_error.h +167 -0
  119. package/deps/simdutf/benchmarks/competition/u8u16/src/xmldecl.c +288 -0
  120. package/deps/simdutf/benchmarks/competition/u8u16/src/xmldecl.h +117 -0
  121. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_g4.c +2 -0
  122. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_mmx.c +2 -0
  123. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_p4.c +3 -0
  124. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_p4_ideal.c +2 -0
  125. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_spu.c +2 -0
  126. package/deps/simdutf/benchmarks/competition/u8u16/u8u16_ssse3.c +3 -0
  127. package/deps/simdutf/benchmarks/competition/u8u16/x8x16_p4.c +2 -0
  128. package/deps/simdutf/benchmarks/competition/utf8lut/LICENSE +23 -0
  129. package/deps/simdutf/benchmarks/competition/utf8lut/data/test_minimal.txt +44 -0
  130. package/deps/simdutf/benchmarks/competition/utf8lut/readme.md +106 -0
  131. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_clang_corr_tests.cmd +11 -0
  132. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_clang_corr_tests.sh +13 -0
  133. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_gcc_corr_tests.sh +13 -0
  134. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_gcc_example.sh +13 -0
  135. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_gcc_file_conv.sh +14 -0
  136. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_gcc_iconv_lib.sh +11 -0
  137. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_gcc_iconv_sample.sh +8 -0
  138. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_mingw_corr_tests.cmd +12 -0
  139. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_mingw_example.cmd +13 -0
  140. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_mingw_file_conv.cmd +14 -0
  141. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_mingw_iconv_lib.cmd +11 -0
  142. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_mingw_iconv_sample.cmd +8 -0
  143. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_msvc_corr_tests.cmd +11 -0
  144. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_msvc_example.cmd +12 -0
  145. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_msvc_file_conv.cmd +13 -0
  146. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_msvc_iconv_lib.cmd +10 -0
  147. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/build_msvc_iconv_sample.cmd +9 -0
  148. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/html_table.py +25 -0
  149. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/measure.py +94 -0
  150. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/resize.py +20 -0
  151. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/wipe_all.cmd +2 -0
  152. package/deps/simdutf/benchmarks/competition/utf8lut/scripts/wipe_interm.cmd +1 -0
  153. package/deps/simdutf/benchmarks/competition/utf8lut/src/base/CustomMemcpy.h +75 -0
  154. package/deps/simdutf/benchmarks/competition/utf8lut/src/base/PerfDefs.h +47 -0
  155. package/deps/simdutf/benchmarks/competition/utf8lut/src/base/Timing.cpp +17 -0
  156. package/deps/simdutf/benchmarks/competition/utf8lut/src/base/Timing.h +76 -0
  157. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/AllProcessors.cpp +35 -0
  158. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/BaseBufferProcessor.cpp +117 -0
  159. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/BaseBufferProcessor.h +210 -0
  160. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/BufferDecoder.h +158 -0
  161. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/BufferEncoder.h +104 -0
  162. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/ProcessorPlugins.h +334 -0
  163. package/deps/simdutf/benchmarks/competition/utf8lut/src/buffer/ProcessorSelector.h +186 -0
  164. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/DecoderLut.cpp +140 -0
  165. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/DecoderLut.h +42 -0
  166. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/DecoderProcess.h +100 -0
  167. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/Dfa.h +57 -0
  168. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/EncoderLut.cpp +85 -0
  169. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/EncoderLut.h +27 -0
  170. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/EncoderProcess.h +126 -0
  171. package/deps/simdutf/benchmarks/competition/utf8lut/src/core/ProcessTrivial.h +108 -0
  172. package/deps/simdutf/benchmarks/competition/utf8lut/src/iconv/iconv.cpp +139 -0
  173. package/deps/simdutf/benchmarks/competition/utf8lut/src/iconv/iconv.h +74 -0
  174. package/deps/simdutf/benchmarks/competition/utf8lut/src/message/MessageConverter.cpp +65 -0
  175. package/deps/simdutf/benchmarks/competition/utf8lut/src/message/MessageConverter.h +91 -0
  176. package/deps/simdutf/benchmarks/competition/utf8lut/src/tests/CorrectnessTests.cpp +772 -0
  177. package/deps/simdutf/benchmarks/competition/utf8lut/src/tests/Example.cpp +12 -0
  178. package/deps/simdutf/benchmarks/competition/utf8lut/src/tests/FileConverter.cpp +486 -0
  179. package/deps/simdutf/benchmarks/competition/utf8lut/src/tests/iconv_sample.c +162 -0
  180. package/deps/simdutf/benchmarks/competition/utf8lut/src/utf8lut.h +15 -0
  181. package/deps/simdutf/benchmarks/competition/utf8sse4/fromutf8-sse.cpp +292 -0
  182. package/deps/simdutf/benchmarks/competition/utfcpp/LICENSE +23 -0
  183. package/deps/simdutf/benchmarks/competition/utfcpp/README.md +1503 -0
  184. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8/checked.h +335 -0
  185. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8/core.h +338 -0
  186. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8/cpp11.h +103 -0
  187. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8/cpp17.h +103 -0
  188. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8/unchecked.h +274 -0
  189. package/deps/simdutf/benchmarks/competition/utfcpp/source/utf8.h +34 -0
  190. package/deps/simdutf/benchmarks/dataset/README.md +155 -0
  191. package/deps/simdutf/benchmarks/dataset/emoji.txt +204 -0
  192. package/deps/simdutf/benchmarks/dataset/scripts/utf8type.py +40 -0
  193. package/deps/simdutf/benchmarks/dataset/wikipedia_mars/Makefile +80 -0
  194. package/deps/simdutf/benchmarks/dataset/wikipedia_mars/convert_to_utf6.py +20 -0
  195. package/deps/simdutf/benchmarks/find/CMakeLists.txt +6 -0
  196. package/deps/simdutf/benchmarks/find/findbenchmark.cpp +63 -0
  197. package/deps/simdutf/benchmarks/find/findbenchmarker.h +46 -0
  198. package/deps/simdutf/benchmarks/shortbench.cpp +555 -0
  199. package/deps/simdutf/benchmarks/src/CMakeLists.txt +52 -0
  200. package/deps/simdutf/benchmarks/src/apple_arm_events.h +1104 -0
  201. package/deps/simdutf/benchmarks/src/benchmark.cpp +3899 -0
  202. package/deps/simdutf/benchmarks/src/benchmark.h +317 -0
  203. package/deps/simdutf/benchmarks/src/benchmark_base.cpp +144 -0
  204. package/deps/simdutf/benchmarks/src/benchmark_base.h +98 -0
  205. package/deps/simdutf/benchmarks/src/cmdline.cpp +176 -0
  206. package/deps/simdutf/benchmarks/src/cmdline.h +35 -0
  207. package/deps/simdutf/benchmarks/src/event_counter.h +162 -0
  208. package/deps/simdutf/benchmarks/src/linux-perf-events.h +104 -0
  209. package/deps/simdutf/benchmarks/stream.cpp +209 -0
  210. package/deps/simdutf/benchmarks/threaded.cpp +123 -0
  211. package/deps/simdutf/cmake/CPM.cmake +1363 -0
  212. package/deps/simdutf/cmake/JoinPaths.cmake +23 -0
  213. package/deps/simdutf/cmake/add_cpp_test.cmake +68 -0
  214. package/deps/simdutf/cmake/simdutf-config.cmake.in +2 -0
  215. package/deps/simdutf/cmake/simdutf-flags.cmake +26 -0
  216. package/deps/simdutf/cmake/toolchains-ci/riscv64-linux-gnu.cmake +4 -0
  217. package/deps/simdutf/cmake/toolchains-dev/README.md +32 -0
  218. package/deps/simdutf/cmake/toolchains-dev/aarch64.cmake +14 -0
  219. package/deps/simdutf/cmake/toolchains-dev/loongarch64.cmake +22 -0
  220. package/deps/simdutf/cmake/toolchains-dev/powerpc64.cmake +16 -0
  221. package/deps/simdutf/cmake/toolchains-dev/powerpc64le.cmake +16 -0
  222. package/deps/simdutf/cmake/toolchains-dev/riscv64.cmake +16 -0
  223. package/deps/simdutf/cmake/toolchains-dev/rvv-spike.cmake +38 -0
  224. package/deps/simdutf/doc/avx512.png +0 -0
  225. package/deps/simdutf/doc/logo.png +0 -0
  226. package/deps/simdutf/doc/logo.svg +165 -0
  227. package/deps/simdutf/doc/node2023.png +0 -0
  228. package/deps/simdutf/doc/shortinput.md +78 -0
  229. package/deps/simdutf/doc/utf16utf8.png +0 -0
  230. package/deps/simdutf/doc/utf8utf16.png +0 -0
  231. package/deps/simdutf/doc/widelogo.png +0 -0
  232. package/deps/simdutf/doxygen.py +50 -0
  233. package/deps/simdutf/fuzz/.clang-format +9 -0
  234. package/deps/simdutf/fuzz/CMakeLists.txt +45 -0
  235. package/deps/simdutf/fuzz/README.md +168 -0
  236. package/deps/simdutf/fuzz/atomic_base64.cpp +448 -0
  237. package/deps/simdutf/fuzz/base64.cpp +278 -0
  238. package/deps/simdutf/fuzz/build.sh +83 -0
  239. package/deps/simdutf/fuzz/conversion.cpp +669 -0
  240. package/deps/simdutf/fuzz/helpers/.clang-format-ignore +1 -0
  241. package/deps/simdutf/fuzz/helpers/common.h +135 -0
  242. package/deps/simdutf/fuzz/helpers/nameof.hpp +1258 -0
  243. package/deps/simdutf/fuzz/main.cpp +72 -0
  244. package/deps/simdutf/fuzz/minimize_and_cleanse.sh +87 -0
  245. package/deps/simdutf/fuzz/misc.cpp +216 -0
  246. package/deps/simdutf/fuzz/random_fuzz.sh +154 -0
  247. package/deps/simdutf/fuzz/roundtrip.cpp +588 -0
  248. package/deps/simdutf/fuzz/safe_conversion.cpp +104 -0
  249. package/deps/simdutf/include/simdutf/avx512.h +79 -0
  250. package/deps/simdutf/include/simdutf/base64_implementation.h +158 -0
  251. package/deps/simdutf/include/simdutf/base64_tables.h +887 -0
  252. package/deps/simdutf/include/simdutf/common_defs.h +186 -0
  253. package/deps/simdutf/include/simdutf/compiler_check.h +50 -0
  254. package/deps/simdutf/include/simdutf/constexpr_ptr.h +138 -0
  255. package/deps/simdutf/include/simdutf/encoding_types.h +189 -0
  256. package/deps/simdutf/include/simdutf/error.h +126 -0
  257. package/deps/simdutf/include/simdutf/implementation.h +7081 -0
  258. package/deps/simdutf/include/simdutf/internal/isadetection.h +325 -0
  259. package/deps/simdutf/include/simdutf/portability.h +285 -0
  260. package/deps/simdutf/include/simdutf/scalar/ascii.h +86 -0
  261. package/deps/simdutf/include/simdutf/scalar/atomic_util.h +105 -0
  262. package/deps/simdutf/include/simdutf/scalar/base64.h +911 -0
  263. package/deps/simdutf/include/simdutf/scalar/latin1.h +26 -0
  264. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf16/latin1_to_utf16.h +52 -0
  265. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf32/latin1_to_utf32.h +27 -0
  266. package/deps/simdutf/include/simdutf/scalar/latin1_to_utf8/latin1_to_utf8.h +191 -0
  267. package/deps/simdutf/include/simdutf/scalar/swap_bytes.h +35 -0
  268. package/deps/simdutf/include/simdutf/scalar/utf16.h +226 -0
  269. package/deps/simdutf/include/simdutf/scalar/utf16_to_latin1/utf16_to_latin1.h +108 -0
  270. package/deps/simdutf/include/simdutf/scalar/utf16_to_latin1/valid_utf16_to_latin1.h +40 -0
  271. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf32/utf16_to_utf32.h +86 -0
  272. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf32/valid_utf16_to_utf32.h +44 -0
  273. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf8/utf16_to_utf8.h +295 -0
  274. package/deps/simdutf/include/simdutf/scalar/utf16_to_utf8/valid_utf16_to_utf8.h +91 -0
  275. package/deps/simdutf/include/simdutf/scalar/utf32.h +82 -0
  276. package/deps/simdutf/include/simdutf/scalar/utf32_to_latin1/utf32_to_latin1.h +68 -0
  277. package/deps/simdutf/include/simdutf/scalar/utf32_to_latin1/valid_utf32_to_latin1.h +67 -0
  278. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf16/utf32_to_utf16.h +84 -0
  279. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf16/valid_utf32_to_utf16.h +44 -0
  280. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf8/utf32_to_utf8.h +142 -0
  281. package/deps/simdutf/include/simdutf/scalar/utf32_to_utf8/valid_utf32_to_utf8.h +72 -0
  282. package/deps/simdutf/include/simdutf/scalar/utf8.h +326 -0
  283. package/deps/simdutf/include/simdutf/scalar/utf8_to_latin1/utf8_to_latin1.h +225 -0
  284. package/deps/simdutf/include/simdutf/scalar/utf8_to_latin1/valid_utf8_to_latin1.h +87 -0
  285. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf16/utf8_to_utf16.h +342 -0
  286. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf16/valid_utf8_to_utf16.h +106 -0
  287. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf32/utf8_to_utf32.h +299 -0
  288. package/deps/simdutf/include/simdutf/scalar/utf8_to_utf32/valid_utf8_to_utf32.h +83 -0
  289. package/deps/simdutf/include/simdutf/simdutf_version.h +26 -0
  290. package/deps/simdutf/include/simdutf.h +26 -0
  291. package/deps/simdutf/include/simdutf_c.h +342 -0
  292. package/deps/simdutf/riscv/Dockerfile +16 -0
  293. package/deps/simdutf/riscv/README.md +24 -0
  294. package/deps/simdutf/riscv/remove-docker-station +8 -0
  295. package/deps/simdutf/riscv/run-docker-station +31 -0
  296. package/deps/simdutf/scripts/.flake8 +2 -0
  297. package/deps/simdutf/scripts/Makefile +2 -0
  298. package/deps/simdutf/scripts/README_ADD_FUNCTION.md +49 -0
  299. package/deps/simdutf/scripts/add_function.py +330 -0
  300. package/deps/simdutf/scripts/amalgamation_tests.py +156 -0
  301. package/deps/simdutf/scripts/base64/Makefile +2 -0
  302. package/deps/simdutf/scripts/base64/README.md +2 -0
  303. package/deps/simdutf/scripts/base64/avx512.py +76 -0
  304. package/deps/simdutf/scripts/base64/neon_decode.py +143 -0
  305. package/deps/simdutf/scripts/base64/neon_generate_lut.py +101 -0
  306. package/deps/simdutf/scripts/base64/sse.py +252 -0
  307. package/deps/simdutf/scripts/base64/sseregular.py +160 -0
  308. package/deps/simdutf/scripts/base64/sseurl.py +283 -0
  309. package/deps/simdutf/scripts/base64/table.py +59 -0
  310. package/deps/simdutf/scripts/base64bench_print.py +145 -0
  311. package/deps/simdutf/scripts/benchmark-all.py +119 -0
  312. package/deps/simdutf/scripts/benchmark_print.py +324 -0
  313. package/deps/simdutf/scripts/check_feature_macros.py +156 -0
  314. package/deps/simdutf/scripts/check_typos.sh +13 -0
  315. package/deps/simdutf/scripts/clang_format.sh +35 -0
  316. package/deps/simdutf/scripts/clang_format_docker.sh +38 -0
  317. package/deps/simdutf/scripts/common.py +24 -0
  318. package/deps/simdutf/scripts/compilation_benchmark.py +55 -0
  319. package/deps/simdutf/scripts/compile_many_variations.sh +64 -0
  320. package/deps/simdutf/scripts/create_latex_table.py +62 -0
  321. package/deps/simdutf/scripts/docker/Dockerfile +14 -0
  322. package/deps/simdutf/scripts/docker/Makefile +9 -0
  323. package/deps/simdutf/scripts/docker/README.md +30 -0
  324. package/deps/simdutf/scripts/docker/llvm.gpg +0 -0
  325. package/deps/simdutf/scripts/ppc64_convert_utf16_to_utf8.py +155 -0
  326. package/deps/simdutf/scripts/prepare_doxygen.sh +21 -0
  327. package/deps/simdutf/scripts/release.py +197 -0
  328. package/deps/simdutf/scripts/shortinputplots.py +97 -0
  329. package/deps/simdutf/scripts/sse_convert_utf16_to_utf8.py +422 -0
  330. package/deps/simdutf/scripts/sse_convert_utf32_to_utf16.py +105 -0
  331. package/deps/simdutf/scripts/sse_utf8_utf16_decode.py +186 -0
  332. package/deps/simdutf/scripts/sse_validate_utf16le_proof.py +137 -0
  333. package/deps/simdutf/scripts/sse_validate_utf16le_testcases.py +129 -0
  334. package/deps/simdutf/scripts/table.py +207 -0
  335. package/deps/simdutf/scripts/tests/new.txt +33 -0
  336. package/deps/simdutf/scripts/tests/old.txt +33 -0
  337. package/deps/simdutf/scripts/tests/results.txt +272 -0
  338. package/deps/simdutf/simdutf.pc.in +11 -0
  339. package/deps/simdutf/singleheader/.flake8 +2 -0
  340. package/deps/simdutf/singleheader/CMakeLists.txt +64 -0
  341. package/deps/simdutf/singleheader/README-dev.md +81 -0
  342. package/deps/simdutf/singleheader/README.md +19 -0
  343. package/deps/simdutf/singleheader/amalgamate.py +513 -0
  344. package/deps/simdutf/singleheader/amalgamation_demo.c +59 -0
  345. package/deps/simdutf/singleheader/amalgamation_demo.cpp +54 -0
  346. package/deps/simdutf/singleheader/test-features.py +262 -0
  347. package/deps/simdutf/src/CMakeLists.txt +78 -0
  348. package/deps/simdutf/src/arm64/arm_base64.cpp +791 -0
  349. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf16.cpp +24 -0
  350. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf32.cpp +24 -0
  351. package/deps/simdutf/src/arm64/arm_convert_latin1_to_utf8.cpp +70 -0
  352. package/deps/simdutf/src/arm64/arm_convert_utf16_to_latin1.cpp +61 -0
  353. package/deps/simdutf/src/arm64/arm_convert_utf16_to_utf32.cpp +185 -0
  354. package/deps/simdutf/src/arm64/arm_convert_utf16_to_utf8.cpp +780 -0
  355. package/deps/simdutf/src/arm64/arm_convert_utf32_to_latin1.cpp +60 -0
  356. package/deps/simdutf/src/arm64/arm_convert_utf32_to_utf16.cpp +208 -0
  357. package/deps/simdutf/src/arm64/arm_convert_utf32_to_utf8.cpp +505 -0
  358. package/deps/simdutf/src/arm64/arm_convert_utf8_to_latin1.cpp +69 -0
  359. package/deps/simdutf/src/arm64/arm_convert_utf8_to_utf16.cpp +313 -0
  360. package/deps/simdutf/src/arm64/arm_convert_utf8_to_utf32.cpp +179 -0
  361. package/deps/simdutf/src/arm64/arm_find.cpp +199 -0
  362. package/deps/simdutf/src/arm64/arm_utf16fix.cpp +185 -0
  363. package/deps/simdutf/src/arm64/arm_validate_utf16.cpp +165 -0
  364. package/deps/simdutf/src/arm64/arm_validate_utf32le.cpp +65 -0
  365. package/deps/simdutf/src/arm64/implementation.cpp +1442 -0
  366. package/deps/simdutf/src/encoding_types.cpp +67 -0
  367. package/deps/simdutf/src/error.cpp +3 -0
  368. package/deps/simdutf/src/fallback/implementation.cpp +589 -0
  369. package/deps/simdutf/src/generic/ascii_validation.h +50 -0
  370. package/deps/simdutf/src/generic/base64.h +233 -0
  371. package/deps/simdutf/src/generic/base64lengths.h +63 -0
  372. package/deps/simdutf/src/generic/buf_block_reader.h +109 -0
  373. package/deps/simdutf/src/generic/find.h +75 -0
  374. package/deps/simdutf/src/generic/utf16/change_endianness.h +24 -0
  375. package/deps/simdutf/src/generic/utf16/count_code_points_bytemask.h +58 -0
  376. package/deps/simdutf/src/generic/utf16/to_well_formed.h +93 -0
  377. package/deps/simdutf/src/generic/utf16/utf32_length_from_utf16.h +15 -0
  378. package/deps/simdutf/src/generic/utf16/utf8_length_from_utf16.h +35 -0
  379. package/deps/simdutf/src/generic/utf16/utf8_length_from_utf16_bytemask.h +199 -0
  380. package/deps/simdutf/src/generic/utf16.h +73 -0
  381. package/deps/simdutf/src/generic/utf32.h +136 -0
  382. package/deps/simdutf/src/generic/utf8/utf16_length_from_utf8_bytemask.h +53 -0
  383. package/deps/simdutf/src/generic/utf8.h +92 -0
  384. package/deps/simdutf/src/generic/utf8_to_latin1/utf8_to_latin1.h +316 -0
  385. package/deps/simdutf/src/generic/utf8_to_latin1/valid_utf8_to_latin1.h +78 -0
  386. package/deps/simdutf/src/generic/utf8_to_utf16/utf8_to_utf16.h +332 -0
  387. package/deps/simdutf/src/generic/utf8_to_utf16/valid_utf8_to_utf16.h +74 -0
  388. package/deps/simdutf/src/generic/utf8_to_utf32/utf8_to_utf32.h +318 -0
  389. package/deps/simdutf/src/generic/utf8_to_utf32/valid_utf8_to_utf32.h +42 -0
  390. package/deps/simdutf/src/generic/utf8_validation/utf8_lookup4_algorithm.h +223 -0
  391. package/deps/simdutf/src/generic/utf8_validation/utf8_validator.h +84 -0
  392. package/deps/simdutf/src/generic/validate_utf16.h +164 -0
  393. package/deps/simdutf/src/generic/validate_utf32.h +99 -0
  394. package/deps/simdutf/src/haswell/avx2_base64.cpp +837 -0
  395. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf16.cpp +28 -0
  396. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf32.cpp +20 -0
  397. package/deps/simdutf/src/haswell/avx2_convert_latin1_to_utf8.cpp +83 -0
  398. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_latin1.cpp +83 -0
  399. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_utf32.cpp +210 -0
  400. package/deps/simdutf/src/haswell/avx2_convert_utf16_to_utf8.cpp +602 -0
  401. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_latin1.cpp +116 -0
  402. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_utf16.cpp +164 -0
  403. package/deps/simdutf/src/haswell/avx2_convert_utf32_to_utf8.cpp +569 -0
  404. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_latin1.cpp +60 -0
  405. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_utf16.cpp +195 -0
  406. package/deps/simdutf/src/haswell/avx2_convert_utf8_to_utf32.cpp +135 -0
  407. package/deps/simdutf/src/haswell/avx2_utf16fix.cpp +173 -0
  408. package/deps/simdutf/src/haswell/avx2_validate_utf16.cpp +17 -0
  409. package/deps/simdutf/src/haswell/implementation.cpp +1447 -0
  410. package/deps/simdutf/src/icelake/icelake_ascii_validation.inl.cpp +19 -0
  411. package/deps/simdutf/src/icelake/icelake_base64.inl.cpp +630 -0
  412. package/deps/simdutf/src/icelake/icelake_common.inl.cpp +37 -0
  413. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf16.inl.cpp +36 -0
  414. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf32.inl.cpp +23 -0
  415. package/deps/simdutf/src/icelake/icelake_convert_latin1_to_utf8.inl.cpp +107 -0
  416. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_latin1.inl.cpp +103 -0
  417. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_utf32.inl.cpp +136 -0
  418. package/deps/simdutf/src/icelake/icelake_convert_utf16_to_utf8.inl.cpp +206 -0
  419. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_latin1.inl.cpp +74 -0
  420. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_utf16.inl.cpp +338 -0
  421. package/deps/simdutf/src/icelake/icelake_convert_utf32_to_utf8.inl.cpp +574 -0
  422. package/deps/simdutf/src/icelake/icelake_convert_utf8_to_latin1.inl.cpp +104 -0
  423. package/deps/simdutf/src/icelake/icelake_convert_utf8_to_utf16.inl.cpp +75 -0
  424. package/deps/simdutf/src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp +69 -0
  425. package/deps/simdutf/src/icelake/icelake_find.inl.cpp +146 -0
  426. package/deps/simdutf/src/icelake/icelake_from_utf8.inl.cpp +266 -0
  427. package/deps/simdutf/src/icelake/icelake_from_valid_utf8.inl.cpp +136 -0
  428. package/deps/simdutf/src/icelake/icelake_macros.inl.cpp +143 -0
  429. package/deps/simdutf/src/icelake/icelake_utf16fix.cpp +138 -0
  430. package/deps/simdutf/src/icelake/icelake_utf32_validation.inl.cpp +63 -0
  431. package/deps/simdutf/src/icelake/icelake_utf8_common.inl.cpp +753 -0
  432. package/deps/simdutf/src/icelake/icelake_utf8_length_from_utf16.inl.cpp +269 -0
  433. package/deps/simdutf/src/icelake/icelake_utf8_validation.inl.cpp +116 -0
  434. package/deps/simdutf/src/icelake/implementation.cpp +1903 -0
  435. package/deps/simdutf/src/implementation.cpp +2526 -0
  436. package/deps/simdutf/src/lasx/implementation.cpp +1531 -0
  437. package/deps/simdutf/src/lasx/lasx_base64.cpp +695 -0
  438. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf16.cpp +76 -0
  439. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf32.cpp +55 -0
  440. package/deps/simdutf/src/lasx/lasx_convert_latin1_to_utf8.cpp +65 -0
  441. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_latin1.cpp +64 -0
  442. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_utf32.cpp +183 -0
  443. package/deps/simdutf/src/lasx/lasx_convert_utf16_to_utf8.cpp +550 -0
  444. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_latin1.cpp +73 -0
  445. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_utf16.cpp +218 -0
  446. package/deps/simdutf/src/lasx/lasx_convert_utf32_to_utf8.cpp +589 -0
  447. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_latin1.cpp +72 -0
  448. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_utf16.cpp +296 -0
  449. package/deps/simdutf/src/lasx/lasx_convert_utf8_to_utf32.cpp +190 -0
  450. package/deps/simdutf/src/lasx/lasx_find.cpp +64 -0
  451. package/deps/simdutf/src/lasx/lasx_validate_utf16.cpp +13 -0
  452. package/deps/simdutf/src/lasx/lasx_validate_utf32le.cpp +84 -0
  453. package/deps/simdutf/src/lsx/implementation.cpp +1417 -0
  454. package/deps/simdutf/src/lsx/lsx_base64.cpp +675 -0
  455. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf16.cpp +39 -0
  456. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf32.cpp +27 -0
  457. package/deps/simdutf/src/lsx/lsx_convert_latin1_to_utf8.cpp +56 -0
  458. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_latin1.cpp +64 -0
  459. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_utf32.cpp +133 -0
  460. package/deps/simdutf/src/lsx/lsx_convert_utf16_to_utf8.cpp +518 -0
  461. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_latin1.cpp +66 -0
  462. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_utf16.cpp +155 -0
  463. package/deps/simdutf/src/lsx/lsx_convert_utf32_to_utf8.cpp +459 -0
  464. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_latin1.cpp +75 -0
  465. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_utf16.cpp +291 -0
  466. package/deps/simdutf/src/lsx/lsx_convert_utf8_to_utf32.cpp +179 -0
  467. package/deps/simdutf/src/lsx/lsx_find.cpp +60 -0
  468. package/deps/simdutf/src/lsx/lsx_validate_utf16.cpp +13 -0
  469. package/deps/simdutf/src/lsx/lsx_validate_utf32le.cpp +68 -0
  470. package/deps/simdutf/src/ppc64/implementation.cpp +992 -0
  471. package/deps/simdutf/src/ppc64/ppc64_base64.cpp +480 -0
  472. package/deps/simdutf/src/ppc64/ppc64_base64_internal_tests.cpp +401 -0
  473. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf16.cpp +12 -0
  474. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf32.cpp +12 -0
  475. package/deps/simdutf/src/ppc64/ppc64_convert_latin1_to_utf8.cpp +149 -0
  476. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_latin1.cpp +67 -0
  477. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_utf32.cpp +87 -0
  478. package/deps/simdutf/src/ppc64/ppc64_convert_utf16_to_utf8.cpp +296 -0
  479. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_latin1.cpp +57 -0
  480. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_utf16.cpp +117 -0
  481. package/deps/simdutf/src/ppc64/ppc64_convert_utf32_to_utf8.cpp +166 -0
  482. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_latin1.cpp +69 -0
  483. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_utf16.cpp +211 -0
  484. package/deps/simdutf/src/ppc64/ppc64_convert_utf8_to_utf32.cpp +153 -0
  485. package/deps/simdutf/src/ppc64/ppc64_utf16_to_utf8_tables.h +1011 -0
  486. package/deps/simdutf/src/ppc64/ppc64_utf8_length_from_latin1.cpp +37 -0
  487. package/deps/simdutf/src/ppc64/ppc64_validate_utf16.cpp +19 -0
  488. package/deps/simdutf/src/ppc64/templates.cpp +91 -0
  489. package/deps/simdutf/src/rvv/implementation.cpp +138 -0
  490. package/deps/simdutf/src/rvv/rvv_find.cpp +27 -0
  491. package/deps/simdutf/src/rvv/rvv_helpers.inl.cpp +23 -0
  492. package/deps/simdutf/src/rvv/rvv_latin1_to.inl.cpp +71 -0
  493. package/deps/simdutf/src/rvv/rvv_length_from.inl.cpp +164 -0
  494. package/deps/simdutf/src/rvv/rvv_utf16_to.inl.cpp +399 -0
  495. package/deps/simdutf/src/rvv/rvv_utf16fix.cpp +110 -0
  496. package/deps/simdutf/src/rvv/rvv_utf32_to.inl.cpp +307 -0
  497. package/deps/simdutf/src/rvv/rvv_utf8_to.inl.cpp +435 -0
  498. package/deps/simdutf/src/rvv/rvv_validate.inl.cpp +275 -0
  499. package/deps/simdutf/src/simdutf/arm64/begin.h +2 -0
  500. package/deps/simdutf/src/simdutf/arm64/bitmanipulation.h +34 -0
  501. package/deps/simdutf/src/simdutf/arm64/end.h +2 -0
  502. package/deps/simdutf/src/simdutf/arm64/implementation.h +307 -0
  503. package/deps/simdutf/src/simdutf/arm64/intrinsics.h +10 -0
  504. package/deps/simdutf/src/simdutf/arm64/simd.h +547 -0
  505. package/deps/simdutf/src/simdutf/arm64/simd16-inl.h +403 -0
  506. package/deps/simdutf/src/simdutf/arm64/simd32-inl.h +129 -0
  507. package/deps/simdutf/src/simdutf/arm64/simd64-inl.h +28 -0
  508. package/deps/simdutf/src/simdutf/arm64.h +43 -0
  509. package/deps/simdutf/src/simdutf/fallback/begin.h +1 -0
  510. package/deps/simdutf/src/simdutf/fallback/bitmanipulation.h +13 -0
  511. package/deps/simdutf/src/simdutf/fallback/end.h +1 -0
  512. package/deps/simdutf/src/simdutf/fallback/implementation.h +331 -0
  513. package/deps/simdutf/src/simdutf/fallback.h +42 -0
  514. package/deps/simdutf/src/simdutf/haswell/begin.h +15 -0
  515. package/deps/simdutf/src/simdutf/haswell/bitmanipulation.h +35 -0
  516. package/deps/simdutf/src/simdutf/haswell/end.h +13 -0
  517. package/deps/simdutf/src/simdutf/haswell/implementation.h +338 -0
  518. package/deps/simdutf/src/simdutf/haswell/intrinsics.h +67 -0
  519. package/deps/simdutf/src/simdutf/haswell/simd.h +363 -0
  520. package/deps/simdutf/src/simdutf/haswell/simd16-inl.h +261 -0
  521. package/deps/simdutf/src/simdutf/haswell/simd32-inl.h +111 -0
  522. package/deps/simdutf/src/simdutf/haswell/simd64-inl.h +34 -0
  523. package/deps/simdutf/src/simdutf/haswell.h +63 -0
  524. package/deps/simdutf/src/simdutf/icelake/begin.h +14 -0
  525. package/deps/simdutf/src/simdutf/icelake/bitmanipulation.h +44 -0
  526. package/deps/simdutf/src/simdutf/icelake/end.h +12 -0
  527. package/deps/simdutf/src/simdutf/icelake/implementation.h +346 -0
  528. package/deps/simdutf/src/simdutf/icelake/intrinsics.h +138 -0
  529. package/deps/simdutf/src/simdutf/icelake/simd.h +17 -0
  530. package/deps/simdutf/src/simdutf/icelake/simd16-inl.h +90 -0
  531. package/deps/simdutf/src/simdutf/icelake/simd32-inl.h +47 -0
  532. package/deps/simdutf/src/simdutf/icelake.h +81 -0
  533. package/deps/simdutf/src/simdutf/lasx/begin.h +8 -0
  534. package/deps/simdutf/src/simdutf/lasx/bitmanipulation.h +25 -0
  535. package/deps/simdutf/src/simdutf/lasx/end.h +8 -0
  536. package/deps/simdutf/src/simdutf/lasx/implementation.h +310 -0
  537. package/deps/simdutf/src/simdutf/lasx/intrinsics.h +319 -0
  538. package/deps/simdutf/src/simdutf/lasx/simd.h +551 -0
  539. package/deps/simdutf/src/simdutf/lasx/simd16-inl.h +234 -0
  540. package/deps/simdutf/src/simdutf/lasx/simd32-inl.h +74 -0
  541. package/deps/simdutf/src/simdutf/lasx/simd64-inl.h +52 -0
  542. package/deps/simdutf/src/simdutf/lasx.h +49 -0
  543. package/deps/simdutf/src/simdutf/lsx/begin.h +2 -0
  544. package/deps/simdutf/src/simdutf/lsx/bitmanipulation.h +25 -0
  545. package/deps/simdutf/src/simdutf/lsx/end.h +2 -0
  546. package/deps/simdutf/src/simdutf/lsx/implementation.h +309 -0
  547. package/deps/simdutf/src/simdutf/lsx/intrinsics.h +196 -0
  548. package/deps/simdutf/src/simdutf/lsx/simd.h +421 -0
  549. package/deps/simdutf/src/simdutf/lsx/simd16-inl.h +242 -0
  550. package/deps/simdutf/src/simdutf/lsx/simd32-inl.h +69 -0
  551. package/deps/simdutf/src/simdutf/lsx/simd64-inl.h +50 -0
  552. package/deps/simdutf/src/simdutf/lsx.h +52 -0
  553. package/deps/simdutf/src/simdutf/ppc64/begin.h +1 -0
  554. package/deps/simdutf/src/simdutf/ppc64/bitmanipulation.h +29 -0
  555. package/deps/simdutf/src/simdutf/ppc64/end.h +1 -0
  556. package/deps/simdutf/src/simdutf/ppc64/implementation.h +348 -0
  557. package/deps/simdutf/src/simdutf/ppc64/intrinsics.h +19 -0
  558. package/deps/simdutf/src/simdutf/ppc64/simd.h +177 -0
  559. package/deps/simdutf/src/simdutf/ppc64/simd16-inl.h +327 -0
  560. package/deps/simdutf/src/simdutf/ppc64/simd32-inl.h +247 -0
  561. package/deps/simdutf/src/simdutf/ppc64/simd8-inl.h +618 -0
  562. package/deps/simdutf/src/simdutf/ppc64.h +40 -0
  563. package/deps/simdutf/src/simdutf/rvv/begin.h +7 -0
  564. package/deps/simdutf/src/simdutf/rvv/end.h +7 -0
  565. package/deps/simdutf/src/simdutf/rvv/implementation.h +321 -0
  566. package/deps/simdutf/src/simdutf/rvv/intrinsics.h +131 -0
  567. package/deps/simdutf/src/simdutf/rvv.h +41 -0
  568. package/deps/simdutf/src/simdutf/westmere/begin.h +8 -0
  569. package/deps/simdutf/src/simdutf/westmere/bitmanipulation.h +37 -0
  570. package/deps/simdutf/src/simdutf/westmere/end.h +8 -0
  571. package/deps/simdutf/src/simdutf/westmere/implementation.h +338 -0
  572. package/deps/simdutf/src/simdutf/westmere/intrinsics.h +38 -0
  573. package/deps/simdutf/src/simdutf/westmere/simd.h +379 -0
  574. package/deps/simdutf/src/simdutf/westmere/simd16-inl.h +242 -0
  575. package/deps/simdutf/src/simdutf/westmere/simd32-inl.h +151 -0
  576. package/deps/simdutf/src/simdutf/westmere/simd64-inl.h +33 -0
  577. package/deps/simdutf/src/simdutf/westmere.h +59 -0
  578. package/deps/simdutf/src/simdutf.cpp +152 -0
  579. package/deps/simdutf/src/simdutf_c.cpp +525 -0
  580. package/deps/simdutf/src/tables/utf16_to_utf8_tables.h +768 -0
  581. package/deps/simdutf/src/tables/utf32_to_utf16_tables.h +53 -0
  582. package/deps/simdutf/src/tables/utf8_to_utf16_tables.h +826 -0
  583. package/deps/simdutf/src/westmere/implementation.cpp +1479 -0
  584. package/deps/simdutf/src/westmere/internal/loader.cpp +7 -0
  585. package/deps/simdutf/src/westmere/internal/write_v_u16_11bits_to_utf8.cpp +66 -0
  586. package/deps/simdutf/src/westmere/sse_base64.cpp +672 -0
  587. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf16.cpp +21 -0
  588. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf32.cpp +31 -0
  589. package/deps/simdutf/src/westmere/sse_convert_latin1_to_utf8.cpp +71 -0
  590. package/deps/simdutf/src/westmere/sse_convert_utf16_to_latin1.cpp +70 -0
  591. package/deps/simdutf/src/westmere/sse_convert_utf16_to_utf32.cpp +206 -0
  592. package/deps/simdutf/src/westmere/sse_convert_utf16_to_utf8.cpp +504 -0
  593. package/deps/simdutf/src/westmere/sse_convert_utf32_to_latin1.cpp +82 -0
  594. package/deps/simdutf/src/westmere/sse_convert_utf32_to_utf16.cpp +209 -0
  595. package/deps/simdutf/src/westmere/sse_convert_utf32_to_utf8.cpp +589 -0
  596. package/deps/simdutf/src/westmere/sse_convert_utf8_to_latin1.cpp +58 -0
  597. package/deps/simdutf/src/westmere/sse_convert_utf8_to_utf16.cpp +197 -0
  598. package/deps/simdutf/src/westmere/sse_convert_utf8_to_utf32.cpp +141 -0
  599. package/deps/simdutf/src/westmere/sse_utf16fix.cpp +82 -0
  600. package/deps/simdutf/src/westmere/sse_validate_utf16.cpp +17 -0
  601. package/deps/simdutf/tests/CMakeLists.txt +483 -0
  602. package/deps/simdutf/tests/atomic_base64_tests.cpp +2845 -0
  603. package/deps/simdutf/tests/base64_tests.cpp +3617 -0
  604. package/deps/simdutf/tests/basic_fuzzer.cpp +805 -0
  605. package/deps/simdutf/tests/bele_tests.cpp +182 -0
  606. package/deps/simdutf/tests/constexpr_base64_tests.cpp +387 -0
  607. package/deps/simdutf/tests/convert_latin1_to_utf16be_tests.cpp +52 -0
  608. package/deps/simdutf/tests/convert_latin1_to_utf16le_tests.cpp +80 -0
  609. package/deps/simdutf/tests/convert_latin1_to_utf32_tests.cpp +66 -0
  610. package/deps/simdutf/tests/convert_latin1_to_utf8_tests.cpp +120 -0
  611. package/deps/simdutf/tests/convert_utf16_to_utf8_safe_tests.cpp +203 -0
  612. package/deps/simdutf/tests/convert_utf16_to_utf8_with_replacement_tests.cpp +276 -0
  613. package/deps/simdutf/tests/convert_utf16be_to_latin1_tests.cpp +109 -0
  614. package/deps/simdutf/tests/convert_utf16be_to_latin1_tests_with_errors.cpp +136 -0
  615. package/deps/simdutf/tests/convert_utf16be_to_utf32_tests.cpp +193 -0
  616. package/deps/simdutf/tests/convert_utf16be_to_utf32_with_errors_tests.cpp +381 -0
  617. package/deps/simdutf/tests/convert_utf16be_to_utf8_tests.cpp +259 -0
  618. package/deps/simdutf/tests/convert_utf16be_to_utf8_with_errors_tests.cpp +266 -0
  619. package/deps/simdutf/tests/convert_utf16le_to_latin1_tests.cpp +148 -0
  620. package/deps/simdutf/tests/convert_utf16le_to_latin1_tests_with_errors.cpp +176 -0
  621. package/deps/simdutf/tests/convert_utf16le_to_utf32_tests.cpp +213 -0
  622. package/deps/simdutf/tests/convert_utf16le_to_utf32_with_errors_tests.cpp +318 -0
  623. package/deps/simdutf/tests/convert_utf16le_to_utf8_tests.cpp +343 -0
  624. package/deps/simdutf/tests/convert_utf16le_to_utf8_with_errors_tests.cpp +271 -0
  625. package/deps/simdutf/tests/convert_utf32_to_latin1_tests.cpp +111 -0
  626. package/deps/simdutf/tests/convert_utf32_to_latin1_with_errors_tests.cpp +96 -0
  627. package/deps/simdutf/tests/convert_utf32_to_utf16be_tests.cpp +148 -0
  628. package/deps/simdutf/tests/convert_utf32_to_utf16be_with_errors_tests.cpp +192 -0
  629. package/deps/simdutf/tests/convert_utf32_to_utf16le_tests.cpp +166 -0
  630. package/deps/simdutf/tests/convert_utf32_to_utf16le_with_errors_tests.cpp +215 -0
  631. package/deps/simdutf/tests/convert_utf32_to_utf8_tests.cpp +181 -0
  632. package/deps/simdutf/tests/convert_utf32_to_utf8_with_errors_tests.cpp +261 -0
  633. package/deps/simdutf/tests/convert_utf8_to_latin1_tests.cpp +516 -0
  634. package/deps/simdutf/tests/convert_utf8_to_latin1_with_errors_tests.cpp +579 -0
  635. package/deps/simdutf/tests/convert_utf8_to_utf16be_tests.cpp +412 -0
  636. package/deps/simdutf/tests/convert_utf8_to_utf16be_with_errors_tests.cpp +480 -0
  637. package/deps/simdutf/tests/convert_utf8_to_utf16le_tests.cpp +671 -0
  638. package/deps/simdutf/tests/convert_utf8_to_utf16le_with_errors_tests.cpp +455 -0
  639. package/deps/simdutf/tests/convert_utf8_to_utf32_tests.cpp +1204 -0
  640. package/deps/simdutf/tests/convert_utf8_to_utf32_with_errors_tests.cpp +337 -0
  641. package/deps/simdutf/tests/convert_valid_utf16be_to_latin1_tests.cpp +37 -0
  642. package/deps/simdutf/tests/convert_valid_utf16be_to_utf32_tests.cpp +97 -0
  643. package/deps/simdutf/tests/convert_valid_utf16be_to_utf8_tests.cpp +126 -0
  644. package/deps/simdutf/tests/convert_valid_utf16le_to_latin1_tests.cpp +71 -0
  645. package/deps/simdutf/tests/convert_valid_utf16le_to_utf32_tests.cpp +122 -0
  646. package/deps/simdutf/tests/convert_valid_utf16le_to_utf8_tests.cpp +244 -0
  647. package/deps/simdutf/tests/convert_valid_utf32_to_latin1_tests.cpp +49 -0
  648. package/deps/simdutf/tests/convert_valid_utf32_to_utf16be_tests.cpp +92 -0
  649. package/deps/simdutf/tests/convert_valid_utf32_to_utf16le_tests.cpp +114 -0
  650. package/deps/simdutf/tests/convert_valid_utf32_to_utf8_tests.cpp +109 -0
  651. package/deps/simdutf/tests/convert_valid_utf8_to_latin1_tests.cpp +84 -0
  652. package/deps/simdutf/tests/convert_valid_utf8_to_utf16be_tests.cpp +124 -0
  653. package/deps/simdutf/tests/convert_valid_utf8_to_utf16le_tests.cpp +221 -0
  654. package/deps/simdutf/tests/convert_valid_utf8_to_utf32_tests.cpp +155 -0
  655. package/deps/simdutf/tests/count_utf16be.cpp +64 -0
  656. package/deps/simdutf/tests/count_utf16le.cpp +61 -0
  657. package/deps/simdutf/tests/count_utf8.cpp +87 -0
  658. package/deps/simdutf/tests/detect_encodings_tests.cpp +312 -0
  659. package/deps/simdutf/tests/embed/valid_utf8.txt +1 -0
  660. package/deps/simdutf/tests/embed_tests.cpp +22 -0
  661. package/deps/simdutf/tests/find_tests.cpp +77 -0
  662. package/deps/simdutf/tests/fixed_string_tests.cpp +153 -0
  663. package/deps/simdutf/tests/helpers/CMakeLists.txt +25 -0
  664. package/deps/simdutf/tests/helpers/compiletime_conversions.h +222 -0
  665. package/deps/simdutf/tests/helpers/fixed_string.h +267 -0
  666. package/deps/simdutf/tests/helpers/random_int.cpp +30 -0
  667. package/deps/simdutf/tests/helpers/random_int.h +39 -0
  668. package/deps/simdutf/tests/helpers/random_utf16.cpp +123 -0
  669. package/deps/simdutf/tests/helpers/random_utf16.h +52 -0
  670. package/deps/simdutf/tests/helpers/random_utf32.cpp +41 -0
  671. package/deps/simdutf/tests/helpers/random_utf32.h +40 -0
  672. package/deps/simdutf/tests/helpers/random_utf8.cpp +93 -0
  673. package/deps/simdutf/tests/helpers/random_utf8.h +36 -0
  674. package/deps/simdutf/tests/helpers/test.cpp +231 -0
  675. package/deps/simdutf/tests/helpers/test.h +193 -0
  676. package/deps/simdutf/tests/helpers/transcode_test_base.cpp +1257 -0
  677. package/deps/simdutf/tests/helpers/transcode_test_base.h +683 -0
  678. package/deps/simdutf/tests/helpers/utf16.h +27 -0
  679. package/deps/simdutf/tests/installation_tests/find/CMakeLists.txt +43 -0
  680. package/deps/simdutf/tests/installation_tests/from_fetch/CMakeLists.txt +47 -0
  681. package/deps/simdutf/tests/internal_tests.cpp +27 -0
  682. package/deps/simdutf/tests/null_safety_tests.cpp +94 -0
  683. package/deps/simdutf/tests/random_fuzzer.cpp +779 -0
  684. package/deps/simdutf/tests/readme_tests.cpp +274 -0
  685. package/deps/simdutf/tests/reference/CMakeLists.txt +23 -0
  686. package/deps/simdutf/tests/reference/decode_utf16.h +81 -0
  687. package/deps/simdutf/tests/reference/decode_utf32.h +47 -0
  688. package/deps/simdutf/tests/reference/encode_latin1.cpp +1 -0
  689. package/deps/simdutf/tests/reference/encode_latin1.h +32 -0
  690. package/deps/simdutf/tests/reference/encode_utf16.cpp +49 -0
  691. package/deps/simdutf/tests/reference/encode_utf16.h +20 -0
  692. package/deps/simdutf/tests/reference/encode_utf32.cpp +1 -0
  693. package/deps/simdutf/tests/reference/encode_utf32.h +36 -0
  694. package/deps/simdutf/tests/reference/encode_utf8.cpp +1 -0
  695. package/deps/simdutf/tests/reference/encode_utf8.h +40 -0
  696. package/deps/simdutf/tests/reference/validate_utf16.cpp +60 -0
  697. package/deps/simdutf/tests/reference/validate_utf16.h +14 -0
  698. package/deps/simdutf/tests/reference/validate_utf16_to_latin1.cpp +35 -0
  699. package/deps/simdutf/tests/reference/validate_utf16_to_latin1.h +13 -0
  700. package/deps/simdutf/tests/reference/validate_utf32.cpp +27 -0
  701. package/deps/simdutf/tests/reference/validate_utf32.h +12 -0
  702. package/deps/simdutf/tests/reference/validate_utf32_to_latin1.cpp +27 -0
  703. package/deps/simdutf/tests/reference/validate_utf32_to_latin1.h +12 -0
  704. package/deps/simdutf/tests/reference/validate_utf8.cpp +82 -0
  705. package/deps/simdutf/tests/reference/validate_utf8.h +11 -0
  706. package/deps/simdutf/tests/reference/validate_utf8_to_latin1.cpp +43 -0
  707. package/deps/simdutf/tests/reference/validate_utf8_to_latin1.h +12 -0
  708. package/deps/simdutf/tests/select_implementation.cpp +43 -0
  709. package/deps/simdutf/tests/simdutf_c_tests.cpp +244 -0
  710. package/deps/simdutf/tests/span_tests.cpp +401 -0
  711. package/deps/simdutf/tests/special_tests.cpp +559 -0
  712. package/deps/simdutf/tests/straight_c_test.c +187 -0
  713. package/deps/simdutf/tests/text_encoding_tests.cpp +77 -0
  714. package/deps/simdutf/tests/to_well_formed_utf16_tests.cpp +377 -0
  715. package/deps/simdutf/tests/utf8_length_from_utf16_tests.cpp +202 -0
  716. package/deps/simdutf/tests/validate_ascii_basic_tests.cpp +165 -0
  717. package/deps/simdutf/tests/validate_ascii_with_errors_tests.cpp +77 -0
  718. package/deps/simdutf/tests/validate_utf16be_basic_tests.cpp +175 -0
  719. package/deps/simdutf/tests/validate_utf16be_with_errors_tests.cpp +188 -0
  720. package/deps/simdutf/tests/validate_utf16le_basic_tests.cpp +268 -0
  721. package/deps/simdutf/tests/validate_utf16le_with_errors_tests.cpp +274 -0
  722. package/deps/simdutf/tests/validate_utf32_basic_tests.cpp +92 -0
  723. package/deps/simdutf/tests/validate_utf32_with_errors_tests.cpp +114 -0
  724. package/deps/simdutf/tests/validate_utf8_basic_tests.cpp +178 -0
  725. package/deps/simdutf/tests/validate_utf8_brute_force_tests.cpp +88 -0
  726. package/deps/simdutf/tests/validate_utf8_puzzler_tests.cpp +33 -0
  727. package/deps/simdutf/tests/validate_utf8_with_errors_tests.cpp +228 -0
  728. package/deps/simdutf/tools/CMakeLists.txt +85 -0
  729. package/deps/simdutf/tools/fastbase64.cpp +250 -0
  730. package/deps/simdutf/tools/sutf.cpp +556 -0
  731. package/deps/simdutf/tools/sutf.h +40 -0
  732. package/lib/commonjs/blake3.js +2 -1
  733. package/lib/commonjs/blake3.js.map +1 -1
  734. package/lib/commonjs/diffie-hellman.js +5 -4
  735. package/lib/commonjs/diffie-hellman.js.map +1 -1
  736. package/lib/commonjs/ecdh.js +5 -4
  737. package/lib/commonjs/ecdh.js.map +1 -1
  738. package/lib/module/blake3.js +2 -1
  739. package/lib/module/blake3.js.map +1 -1
  740. package/lib/module/diffie-hellman.js +5 -4
  741. package/lib/module/diffie-hellman.js.map +1 -1
  742. package/lib/module/ecdh.js +5 -4
  743. package/lib/module/ecdh.js.map +1 -1
  744. package/lib/tsconfig.tsbuildinfo +1 -1
  745. package/lib/typescript/blake3.d.ts.map +1 -1
  746. package/lib/typescript/diffie-hellman.d.ts.map +1 -1
  747. package/lib/typescript/ecdh.d.ts.map +1 -1
  748. package/package.json +2 -2
  749. package/src/blake3.ts +2 -1
  750. package/src/diffie-hellman.ts +5 -7
  751. package/src/ecdh.ts +5 -8
@@ -0,0 +1,3899 @@
1
+ #include "benchmark.h"
2
+ #include "simdutf.h"
3
+
4
+ #include <cassert>
5
+ #include <array>
6
+ #include <iostream>
7
+ #include <chrono>
8
+ #include <thread>
9
+ #include <string>
10
+ #include <vector>
11
+ #ifdef __x86_64__
12
+ /**
13
+ * utf8lut: Vectorized UTF-8 converter.
14
+ * by stgatilov (2019)
15
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
16
+ */
17
+ SIMDUTF_TARGET_WESTMERE
18
+ namespace {
19
+ #include "benchmarks/competition/utf8lut/src/utf8lut.h"
20
+ }
21
+ SIMDUTF_UNTARGET_REGION
22
+
23
+ /**
24
+ * Bob Steagall, CppCon2018
25
+ * https://github.com/BobSteagall/CppCon2018/
26
+ *
27
+ * Fast Conversion From UTF-8 with C++, DFAs, and SSE Intrinsics
28
+ * https://www.youtube.com/watch?v=5FQ87-Ecb-A
29
+ */
30
+ #include "benchmarks/competition/CppCon2018/utf_utils.cpp"
31
+ #endif
32
+
33
+ /**
34
+ * Bjoern Hoehrmann
35
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
36
+ */
37
+ #include "benchmarks/competition/hoehrmann/hoehrmann.h"
38
+ /**
39
+ * LLVM relies on code from the Unicode Consortium
40
+ * https://en.wikipedia.org/wiki/Unicode_Consortium
41
+ */
42
+ #include "benchmarks/competition/llvm/ConvertUTF.cpp"
43
+ #ifdef __x86_64__
44
+ /**
45
+ * Olivier Goffart, UTF-8 processing using SIMD (SSE4), 2012.
46
+ * https://woboq.com/blog/utf-8-processing-using-simd.html
47
+ */
48
+ #include "benchmarks/competition/utf8sse4/fromutf8-sse.cpp"
49
+ #endif
50
+
51
+ #ifdef __x86_64__
52
+ /**
53
+ * benchmarks/competition/u8u16 contains an open source version of u8u16,
54
+ * referenced in Cameron, Robert D, A case study in SIMD text processing with
55
+ * parallel bit streams: UTF-8 to UTF-16 transcoding, Proceedings of the 13th
56
+ * ACM SIGPLAN Symposium on Principles and practice of parallel programming,
57
+ * 91--98.
58
+ */
59
+ // It seems that u8u16 is not good at scoping macros.
60
+ #undef LITTLE_ENDIAN
61
+ #undef BYTE_ORDER
62
+ #undef BIG_ENDIAN
63
+ #include "benchmarks/competition/u8u16/config/p4_config.h"
64
+ #include "benchmarks/competition/u8u16/src/libu8u16.c"
65
+ #endif
66
+
67
+ /**
68
+ * Nemanja Trifunovic, UTF8-CPP: UTF-8 with C++ in a Portable Way
69
+ * https://github.com/nemtrif/utfcpp/releases/tag/v3.2.2
70
+ */
71
+ #include "benchmarks/competition/utfcpp/source/utf8.h"
72
+
73
+ namespace simdutf::benchmarks {
74
+
75
+ template <typename Fn>
76
+ void Benchmark::register_function(std::string name, Fn function,
77
+ std::set<simdutf::encoding_type> set) {
78
+
79
+ if (name.find('+') == std::string::npos) {
80
+ // adding simdutf benchmark, populate for all known architectures
81
+ for (const auto &impl : simdutf::get_available_implementations()) {
82
+ const auto full_name = name + '+' + impl->name();
83
+ benchmarks.insert({full_name, std::make_pair(function, set)});
84
+ }
85
+ } else {
86
+ benchmarks.insert({name, std::make_pair(function, set)});
87
+ }
88
+ }
89
+
90
+ template <typename Fn>
91
+ void Benchmark::register_function(std::string name, Fn function,
92
+ simdutf::encoding_type enc1) {
93
+ std::set<simdutf::encoding_type> set{enc1};
94
+ register_function(name, function, set);
95
+ }
96
+
97
+ template <typename Fn>
98
+ void Benchmark::register_function(std::string name, Fn function,
99
+ simdutf::encoding_type enc1,
100
+ simdutf::encoding_type enc2) {
101
+ std::set<simdutf::encoding_type> set{enc1, enc2};
102
+ register_function(name, function, set);
103
+ }
104
+
105
+ template <typename Fn>
106
+ void Benchmark::register_function(std::string name, Fn function,
107
+ simdutf::encoding_type enc1,
108
+ simdutf::encoding_type enc2,
109
+ simdutf::encoding_type enc3) {
110
+ std::set<simdutf::encoding_type> set{enc1, enc2, enc3};
111
+ register_function(name, function, set);
112
+ }
113
+
114
+ Benchmark::Benchmark(std::vector<input::Testcase> &&testcases)
115
+ : BenchmarkBase(std::move(testcases)) {
116
+ register_function("to_well_formed_utf16le",
117
+ &Benchmark::run_to_well_formed_utf16le,
118
+ simdutf::encoding_type::UTF16_LE);
119
+ register_function("naive_validate_ascii",
120
+ &Benchmark::run_naive_validate_ascii,
121
+ simdutf::encoding_type::UTF8);
122
+ register_function("validate_ascii", &Benchmark::run_validate_ascii,
123
+ simdutf::encoding_type::UTF8);
124
+ register_function("validate_ascii_with_errors",
125
+ &Benchmark::run_validate_ascii_with_errors,
126
+ simdutf::encoding_type::UTF8);
127
+ register_function("validate_utf8", &Benchmark::run_validate_utf8,
128
+ simdutf::encoding_type::UTF8);
129
+ register_function("validate_utf8_with_errors",
130
+ &Benchmark::run_validate_utf8_with_errors,
131
+ simdutf::encoding_type::UTF8);
132
+ register_function("validate_utf16le", &Benchmark::run_validate_utf16le,
133
+ simdutf::encoding_type::UTF16_LE);
134
+ register_function("validate_utf16le_with_errors",
135
+ &Benchmark::run_validate_utf16le_with_errors,
136
+ simdutf::encoding_type::UTF16_LE);
137
+ register_function("validate_utf32", &Benchmark::run_validate_utf32,
138
+ simdutf::encoding_type::UTF32_LE);
139
+ register_function("validate_utf32_with_errors",
140
+ &Benchmark::run_validate_utf32_with_errors,
141
+ simdutf::encoding_type::UTF32_LE);
142
+
143
+ register_function("count_utf8", &Benchmark::run_count_utf8,
144
+ simdutf::encoding_type::UTF8);
145
+ register_function("count_utf16le", &Benchmark::run_count_utf16le,
146
+ simdutf::encoding_type::UTF16_LE);
147
+
148
+ register_function("utf8_length_from_latin1",
149
+ &Benchmark::run_utf8_length_from_latin1,
150
+ simdutf::encoding_type::Latin1);
151
+ register_function("utf8_length_from_utf16le",
152
+ &Benchmark::run_utf8_length_from_utf16le,
153
+ simdutf::encoding_type::UTF16_LE);
154
+ register_function("utf8_length_from_utf16le_with_replacement",
155
+ &Benchmark::run_utf8_length_from_utf16le_with_replacement,
156
+ simdutf::encoding_type::UTF16_LE);
157
+ register_function("utf8_length_from_utf16be",
158
+ &Benchmark::run_utf8_length_from_utf16be,
159
+ simdutf::encoding_type::UTF16_BE);
160
+ register_function("utf8_length_from_utf16be_with_replacement",
161
+ &Benchmark::run_utf8_length_from_utf16be_with_replacement,
162
+ simdutf::encoding_type::UTF16_BE);
163
+ register_function("utf8_length_from_utf32",
164
+ &Benchmark::run_utf8_length_from_utf32,
165
+ simdutf::encoding_type::UTF32_LE);
166
+ register_function("utf16_length_from_utf8",
167
+ &Benchmark::run_utf16_length_from_utf8,
168
+ simdutf::encoding_type::UTF8);
169
+ register_function("convert_latin1_to_utf8",
170
+ &Benchmark::run_convert_latin1_to_utf8,
171
+ simdutf::encoding_type::Latin1);
172
+ register_function("convert_latin1_to_utf16le",
173
+ &Benchmark::run_convert_latin1_to_utf16le,
174
+ simdutf::encoding_type::Latin1);
175
+ register_function("convert_latin1_to_utf32",
176
+ &Benchmark::run_convert_latin1_to_utf32,
177
+ simdutf::encoding_type::Latin1);
178
+
179
+ register_function("convert_utf8_to_latin1",
180
+ &Benchmark::run_convert_utf8_to_latin1,
181
+ simdutf::encoding_type::UTF8);
182
+ register_function("convert_utf8_to_latin1_with_errors",
183
+ &Benchmark::run_convert_utf8_to_latin1_with_errors,
184
+ simdutf::encoding_type::UTF8);
185
+ register_function("convert_valid_utf8_to_latin1",
186
+ &Benchmark::run_convert_valid_utf8_to_latin1,
187
+ simdutf::encoding_type::UTF8);
188
+
189
+ register_function("convert_utf8_to_utf16le",
190
+ &Benchmark::run_convert_utf8_to_utf16le,
191
+ simdutf::encoding_type::UTF8);
192
+ register_function("convert_utf8_to_utf16le_with_errors",
193
+ &Benchmark::run_convert_utf8_to_utf16le_with_errors,
194
+ simdutf::encoding_type::UTF8);
195
+ register_function(
196
+ "convert_utf8_to_utf16le_with_dynamic_allocation",
197
+ &Benchmark::run_convert_utf8_to_utf16le_with_dynamic_allocation,
198
+ simdutf::encoding_type::UTF8);
199
+ register_function("convert_valid_utf8_to_utf16le",
200
+ &Benchmark::run_convert_valid_utf8_to_utf16le,
201
+ simdutf::encoding_type::UTF8);
202
+
203
+ register_function("convert_utf8_to_utf32",
204
+ &Benchmark::run_convert_utf8_to_utf32,
205
+ simdutf::encoding_type::UTF8);
206
+ register_function("convert_utf8_to_utf32_with_errors",
207
+ &Benchmark::run_convert_utf8_to_utf32_with_errors,
208
+ simdutf::encoding_type::UTF8);
209
+ register_function(
210
+ "convert_utf8_to_utf32_with_dynamic_allocation",
211
+ &Benchmark::run_convert_utf8_to_utf32_with_dynamic_allocation,
212
+ simdutf::encoding_type::UTF8);
213
+ register_function("convert_valid_utf8_to_utf32",
214
+ &Benchmark::run_convert_valid_utf8_to_utf32,
215
+ simdutf::encoding_type::UTF8);
216
+
217
+ register_function("convert_utf16le_to_latin1",
218
+ &Benchmark::run_convert_utf16le_to_latin1,
219
+ simdutf::encoding_type::UTF16_LE);
220
+ register_function("convert_utf16le_to_latin1_with_errors",
221
+ &Benchmark::run_convert_utf16le_to_latin1_with_errors,
222
+ simdutf::encoding_type::UTF16_LE);
223
+ register_function("convert_valid_utf16le_to_latin1",
224
+ &Benchmark::run_convert_valid_utf16le_to_latin1,
225
+ simdutf::encoding_type::UTF16_LE);
226
+ #if SIMDUTF_IS_BIG_ENDIAN
227
+ register_function("convert_utf16_to_utf8_safe",
228
+ &Benchmark::run_convert_utf16_to_utf8_safe,
229
+ simdutf::encoding_type::UTF16_BE);
230
+ #else
231
+ register_function("convert_utf16_to_utf8_safe",
232
+ &Benchmark::run_convert_utf16_to_utf8_safe,
233
+ simdutf::encoding_type::UTF16_LE);
234
+ #endif // SIMDUTF_IS_BIG_ENDIAN
235
+ register_function("convert_utf16le_to_utf8",
236
+ &Benchmark::run_convert_utf16le_to_utf8,
237
+ simdutf::encoding_type::UTF16_LE);
238
+ register_function("convert_utf16le_to_utf8_with_errors",
239
+ &Benchmark::run_convert_utf16le_to_utf8_with_errors,
240
+ simdutf::encoding_type::UTF16_LE);
241
+ register_function(
242
+ "convert_utf16le_to_utf8_with_dynamic_allocation",
243
+ &Benchmark::run_convert_utf16le_to_utf8_with_dynamic_allocation,
244
+ simdutf::encoding_type::UTF16_LE);
245
+ register_function("convert_valid_utf16le_to_utf8",
246
+ &Benchmark::run_convert_valid_utf16le_to_utf8,
247
+ simdutf::encoding_type::UTF16_LE);
248
+
249
+ register_function("convert_utf16le_to_utf32",
250
+ &Benchmark::run_convert_utf16le_to_utf32,
251
+ simdutf::encoding_type::UTF16_LE);
252
+ register_function("convert_utf16le_to_utf32_with_errors",
253
+ &Benchmark::run_convert_utf16le_to_utf32_with_errors,
254
+ simdutf::encoding_type::UTF16_LE);
255
+ register_function(
256
+ "convert_utf16le_to_utf32_with_dynamic_allocation",
257
+ &Benchmark::run_convert_utf16le_to_utf32_with_dynamic_allocation,
258
+ simdutf::encoding_type::UTF16_LE);
259
+ register_function("convert_valid_utf16le_to_utf32",
260
+ &Benchmark::run_convert_valid_utf16le_to_utf32,
261
+ simdutf::encoding_type::UTF16_LE);
262
+
263
+ register_function("convert_utf32_to_latin1",
264
+ &Benchmark::run_convert_utf32_to_latin1,
265
+ simdutf::encoding_type::UTF32_LE);
266
+ register_function("convert_utf32_to_latin1_with_errors",
267
+ &Benchmark::run_convert_utf32_to_latin1_with_errors,
268
+ simdutf::encoding_type::UTF32_LE);
269
+ register_function("convert_valid_utf32_to_latin1",
270
+ &Benchmark::run_convert_valid_utf32_to_latin1,
271
+ simdutf::encoding_type::UTF32_LE);
272
+
273
+ register_function("convert_utf32_to_utf8",
274
+ &Benchmark::run_convert_utf32_to_utf8,
275
+ simdutf::encoding_type::UTF32_LE);
276
+ register_function("convert_utf32_to_utf8_with_errors",
277
+ &Benchmark::run_convert_utf32_to_utf8_with_errors,
278
+ simdutf::encoding_type::UTF32_LE);
279
+ register_function("convert_valid_utf32_to_utf8",
280
+ &Benchmark::run_convert_valid_utf32_to_utf8,
281
+ simdutf::encoding_type::UTF32_LE);
282
+
283
+ register_function("convert_utf32_to_utf16le",
284
+ &Benchmark::run_convert_utf32_to_utf16<endianness::LITTLE>,
285
+ simdutf::encoding_type::UTF32_LE);
286
+ register_function("convert_utf32_to_utf16be",
287
+ &Benchmark::run_convert_utf32_to_utf16<endianness::BIG>,
288
+ simdutf::encoding_type::UTF32_LE);
289
+ register_function(
290
+ "convert_utf32_to_utf16le_with_errors",
291
+ &Benchmark::run_convert_utf32_to_utf16_with_errors<endianness::LITTLE>,
292
+ simdutf::encoding_type::UTF32_LE);
293
+ register_function(
294
+ "convert_utf32_to_utf16be_with_errors",
295
+ &Benchmark::run_convert_utf32_to_utf16_with_errors<endianness::BIG>,
296
+ simdutf::encoding_type::UTF32_LE);
297
+ register_function(
298
+ "convert_valid_utf32_to_utf16le",
299
+ &Benchmark::run_convert_valid_utf32_to_utf16<endianness::LITTLE>,
300
+ simdutf::encoding_type::UTF32_LE);
301
+ register_function(
302
+ "convert_valid_utf32_to_utf16be",
303
+ &Benchmark::run_convert_valid_utf32_to_utf16<endianness::BIG>,
304
+ simdutf::encoding_type::UTF32_LE);
305
+
306
+ register_function("detect_encodings", &Benchmark::run_detect_encodings,
307
+ simdutf::encoding_type::UTF8,
308
+ simdutf::encoding_type::UTF16_LE,
309
+ simdutf::encoding_type::UTF32_LE);
310
+
311
+ #ifdef ICU_AVAILABLE
312
+ register_function("convert_latin1_to_utf8+icu",
313
+ &Benchmark::run_convert_latin1_to_utf8_icu,
314
+ simdutf::encoding_type::Latin1);
315
+ register_function("convert_latin1_to_utf16+icu",
316
+ &Benchmark::run_convert_latin1_to_utf16_icu,
317
+ simdutf::encoding_type::Latin1);
318
+ register_function("convert_latin1_to_utf32+icu",
319
+ &Benchmark::run_convert_latin1_to_utf32_icu,
320
+ simdutf::encoding_type::Latin1);
321
+ register_function("convert_utf8_to_latin1+icu",
322
+ &Benchmark::run_convert_utf8_to_latin1_icu,
323
+ simdutf::encoding_type::UTF8);
324
+ register_function("convert_utf8_to_utf16+icu",
325
+ &Benchmark::run_convert_utf8_to_utf16_icu,
326
+ simdutf::encoding_type::UTF8);
327
+ register_function("convert_utf16_to_utf8+icu",
328
+ &Benchmark::run_convert_utf16_to_utf8_icu,
329
+ simdutf::encoding_type::UTF16_LE);
330
+ register_function("convert_utf16_to_latin1+icu",
331
+ &Benchmark::run_convert_utf16_to_latin1_icu,
332
+ simdutf::encoding_type::UTF16_LE);
333
+ register_function("convert_utf32_to_latin1+icu",
334
+ &Benchmark::run_convert_utf32_to_latin1_icu,
335
+ simdutf::encoding_type::UTF32_LE);
336
+ #endif
337
+ #ifdef ICONV_AVAILABLE
338
+ register_function("convert_latin1_to_utf8+iconv",
339
+ &Benchmark::run_convert_latin1_to_utf8_iconv,
340
+ simdutf::encoding_type::Latin1);
341
+ register_function("convert_latin1_to_utf16+iconv",
342
+ &Benchmark::run_convert_latin1_to_utf16_iconv,
343
+ simdutf::encoding_type::Latin1);
344
+ register_function("convert_latin1_to_utf32+iconv",
345
+ &Benchmark::run_convert_latin1_to_utf32_iconv,
346
+ simdutf::encoding_type::Latin1);
347
+ register_function("convert_utf8_to_latin1+iconv",
348
+ &Benchmark::run_convert_utf8_to_latin1_iconv,
349
+ simdutf::encoding_type::UTF8);
350
+ register_function("convert_utf8_to_utf16+iconv",
351
+ &Benchmark::run_convert_utf8_to_utf16_iconv,
352
+ simdutf::encoding_type::UTF8);
353
+ register_function("convert_utf16_to_utf8+iconv",
354
+ &Benchmark::run_convert_utf16_to_utf8_iconv,
355
+ simdutf::encoding_type::UTF16_LE);
356
+ register_function("convert_utf16_to_latin1+iconv",
357
+ &Benchmark::run_convert_utf16_to_latin1_iconv,
358
+ simdutf::encoding_type::UTF16_LE);
359
+ register_function("convert_utf32_to_latin1+iconv",
360
+ &Benchmark::run_convert_utf32_to_latin1_iconv,
361
+ simdutf::encoding_type::UTF32_LE);
362
+ #endif
363
+ #ifdef INOUE2008
364
+ register_function("convert_valid_utf8_to_utf16+inoue2008",
365
+ &Benchmark::run_convert_valid_utf8_to_utf16_inoue2008,
366
+ simdutf::encoding_type::UTF8);
367
+ #endif
368
+ #ifdef __x86_64__
369
+ register_function("convert_utf8_to_utf16+u8u16",
370
+ &Benchmark::run_convert_utf8_to_utf16_u8u16,
371
+ simdutf::encoding_type::UTF8);
372
+ register_function("convert_utf16_to_utf8+utf8lut",
373
+ &Benchmark::run_convert_valid_utf8_to_utf16_utf8lut,
374
+ simdutf::encoding_type::UTF16_LE);
375
+ register_function("convert_valid_utf16_to_utf8+utf8lut",
376
+ &Benchmark::run_convert_valid_utf16_to_utf8_utf8lut,
377
+ simdutf::encoding_type::UTF16_LE);
378
+ register_function("convert_utf8_to_utf16+utf8lut",
379
+ &Benchmark::run_convert_valid_utf8_to_utf16_utf8lut,
380
+ simdutf::encoding_type::UTF8);
381
+ register_function("convert_utf8_to_utf32+utf8lut",
382
+ &Benchmark::run_convert_utf8_to_utf32_utf8lut,
383
+ simdutf::encoding_type::UTF8);
384
+ register_function("convert_valid_utf8_to_utf16+utf8lut",
385
+ &Benchmark::run_convert_valid_utf8_to_utf16_utf8lut,
386
+ simdutf::encoding_type::UTF8);
387
+ register_function("convert_utf32_to_utf8+utf8lut",
388
+ &Benchmark::run_convert_valid_utf32_to_utf8_utf8lut,
389
+ simdutf::encoding_type::UTF32_LE);
390
+ register_function("convert_valid_utf32_to_utf8+utf8lut",
391
+ &Benchmark::run_convert_valid_utf32_to_utf8_utf8lut,
392
+ simdutf::encoding_type::UTF32_BE);
393
+ register_function("convert_valid_utf8_to_utf32+utf8lut",
394
+ &Benchmark::run_convert_utf8_to_utf32_utf8lut,
395
+ simdutf::encoding_type::UTF8);
396
+ register_function("convert_utf8_to_utf16+utf8sse4",
397
+ &Benchmark::run_convert_utf8_to_utf16_utf8sse4,
398
+ simdutf::encoding_type::UTF8);
399
+ register_function("convert_utf8_to_utf16+cppcon2018",
400
+ &Benchmark::run_convert_utf8_to_utf16_cppcon2018,
401
+ simdutf::encoding_type::UTF8);
402
+ register_function("convert_utf8_to_utf32+cppcon2018",
403
+ &Benchmark::run_convert_utf8_to_utf32_cppcon2018,
404
+ simdutf::encoding_type::UTF8);
405
+ #endif
406
+ register_function("convert_utf8_to_utf16+hoehrmann",
407
+ &Benchmark::run_convert_utf8_to_utf16_hoehrmann,
408
+ simdutf::encoding_type::UTF8);
409
+ register_function("convert_utf8_to_utf32+hoehrmann",
410
+ &Benchmark::run_convert_utf8_to_utf32_hoehrmann,
411
+ simdutf::encoding_type::UTF8);
412
+
413
+ register_function("convert_utf8_to_utf16+llvm",
414
+ &Benchmark::run_convert_utf8_to_utf16_llvm,
415
+ simdutf::encoding_type::UTF8);
416
+ register_function("convert_utf8_to_utf32+llvm",
417
+ &Benchmark::run_convert_utf8_to_utf32_llvm,
418
+ simdutf::encoding_type::UTF8);
419
+ register_function("convert_utf16_to_utf8+llvm",
420
+ &Benchmark::run_convert_utf16_to_utf8_llvm,
421
+ simdutf::encoding_type::UTF16_LE);
422
+ register_function("convert_utf32_to_utf8+llvm",
423
+ &Benchmark::run_convert_utf32_to_utf8_llvm,
424
+ simdutf::encoding_type::UTF32_LE);
425
+ register_function("convert_utf32_to_utf16+llvm",
426
+ &Benchmark::run_convert_utf32_to_utf16_llvm,
427
+ simdutf::encoding_type::UTF32_LE);
428
+ register_function("convert_utf16_to_utf32+llvm",
429
+ &Benchmark::run_convert_utf16_to_utf32_llvm,
430
+ simdutf::encoding_type::UTF16_LE);
431
+
432
+ register_function("convert_utf8_to_utf16+utfcpp",
433
+ &Benchmark::run_convert_utf8_to_utf16_utfcpp,
434
+ simdutf::encoding_type::UTF8);
435
+ register_function("convert_utf8_to_utf32+utfcpp",
436
+ &Benchmark::run_convert_utf8_to_utf32_utfcpp,
437
+ simdutf::encoding_type::UTF8);
438
+ register_function("convert_utf16_to_utf8+utfcpp",
439
+ &Benchmark::run_convert_utf16_to_utf8_utfcpp,
440
+ simdutf::encoding_type::UTF16_LE);
441
+ register_function("convert_utf32_to_utf8+utfcpp",
442
+ &Benchmark::run_convert_utf32_to_utf8_utfcpp,
443
+ simdutf::encoding_type::UTF32_LE);
444
+
445
+ register_function("utf8_length_from_latin1+node",
446
+ &Benchmark::run_utf8_length_from_latin1_node,
447
+ simdutf::encoding_type::Latin1);
448
+ }
449
+
450
+ // static
451
+ Benchmark Benchmark::create(const CommandLine &cmdline) {
452
+ std::vector<input::Testcase> testcases;
453
+
454
+ using input::File;
455
+ using input::random_utf8;
456
+ using input::Testcase;
457
+
458
+ for (const size_t iterations : cmdline.iterations) {
459
+ for (const auto &path : cmdline.files) {
460
+ testcases.emplace_back(
461
+ Testcase{cmdline.procedures, iterations, File{path}});
462
+ }
463
+
464
+ for (const size_t size : cmdline.random_size) {
465
+ testcases.emplace_back(
466
+ Testcase{cmdline.procedures, iterations, random_utf8{size}});
467
+ }
468
+ }
469
+
470
+ return Benchmark{std::move(testcases)};
471
+ }
472
+
473
+ void Benchmark::list_procedures(ListingMode lm) const {
474
+ switch (lm) {
475
+ case ListingMode::None:
476
+ break;
477
+
478
+ case ListingMode::HumanReadable: {
479
+ const auto &known_procedures = all_procedures();
480
+ printf("Available procedures (%zu)\n", size_t(known_procedures.size()));
481
+ for (const auto &name : known_procedures) {
482
+ printf("- %s\n", name.c_str());
483
+ }
484
+ } break;
485
+
486
+ case ListingMode::PlainLines: {
487
+ const auto &known_procedures = all_procedures();
488
+ for (const auto &name : known_procedures) {
489
+ puts(name.c_str());
490
+ }
491
+ break;
492
+ }
493
+
494
+ case ListingMode::Json: {
495
+ printf("[\n");
496
+ auto first = true;
497
+ for (const auto &item : benchmarks) {
498
+ const auto &name = item.first;
499
+ const auto &entry = item.second;
500
+ if (!first) {
501
+ putchar(',');
502
+ }
503
+ first = false;
504
+
505
+ printf(" {\n");
506
+ printf(" \"name\": \"%s\",\n", name.c_str());
507
+ if (std::holds_alternative<thirdparty_fn>(entry.first)) {
508
+ printf(" \"simdutf\": false,\n");
509
+ } else if (std::holds_alternative<simdutf_fn>(entry.first)) {
510
+ printf(" \"simdutf\": true,\n");
511
+ }
512
+
513
+ {
514
+ printf(" \"encodings\": [");
515
+ bool first = true;
516
+ for (const auto &enc : entry.second) {
517
+ if (!first) {
518
+ putchar(',');
519
+ }
520
+ first = false;
521
+
522
+ switch (enc) {
523
+ case simdutf::UTF8:
524
+ printf("\"utf8\"");
525
+ break;
526
+ case simdutf::UTF16_LE:
527
+ printf("\"utf16le\"");
528
+ break;
529
+ case simdutf::UTF16_BE:
530
+ printf("\"utf16be\"");
531
+ break;
532
+ case simdutf::UTF32_LE:
533
+ printf("\"utf32le\"");
534
+ break;
535
+ case simdutf::UTF32_BE:
536
+ printf("\"utf32be\"");
537
+ break;
538
+ case simdutf::Latin1:
539
+ printf("\"latin1\"");
540
+ break;
541
+ default:
542
+ printf("\"unknown\"");
543
+ break;
544
+ }
545
+ }
546
+ printf("]\n");
547
+ } // encodings
548
+ printf(" }");
549
+ } // for
550
+ printf("]\n");
551
+ break;
552
+ }
553
+ }
554
+ }
555
+
556
+ void Benchmark::run(const std::string &procedure_name, size_t iterations) {
557
+ const auto item = benchmarks.find(procedure_name);
558
+ if (item == benchmarks.end()) {
559
+ std::cerr << "Unsupported procedure: " << procedure_name << '\n';
560
+ std::cerr << "Report the issue.\n";
561
+ std::cerr << " Aborting ! " << '\n';
562
+ exit(1);
563
+ }
564
+
565
+ const auto &entry = item->second;
566
+ if (std::holds_alternative<thirdparty_fn>(entry.first)) {
567
+ const auto fn = std::get<thirdparty_fn>(entry.first);
568
+
569
+ (this->*fn)(iterations);
570
+ } else if (std::holds_alternative<simdutf_fn>(entry.first)) {
571
+ const auto p = procedure_name.find('+');
572
+ const std::string name{procedure_name.substr(0, p)};
573
+ const std::string impl{procedure_name.substr(p + 1)};
574
+
575
+ auto implementation = simdutf::get_available_implementations()[impl];
576
+ if (implementation == nullptr) {
577
+ throw std::runtime_error("Wrong implementation " + impl);
578
+ }
579
+ // If you want to skip the CPU feature checks, you can set
580
+ // a variable when calling the benchmark program. E.g.,
581
+ // SIMDUTF_SKIP_CPU_CHECK=ON benchmark -F myfile.txt
582
+ // This might result in a crash (E.g., Illegal instruction).
583
+ SIMDUTF_PUSH_DISABLE_WARNINGS
584
+ SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC:
585
+ // manually verified this is safe
586
+ static const char *skip_check = getenv("SIMDUTF_SKIP_CPU_CHECK");
587
+ SIMDUTF_POP_DISABLE_WARNINGS
588
+ if (!skip_check && !implementation->supported_by_runtime_system()) {
589
+ std::cout << procedure_name << ": unsupported by the system\n";
590
+ return;
591
+ }
592
+
593
+ const auto fn = std::get<simdutf_fn>(entry.first);
594
+ (this->*fn)(*implementation, iterations);
595
+ } else {
596
+ throw std::logic_error("The entry for '" + procedure_name +
597
+ "' is not valid. Please report an issue.");
598
+ }
599
+
600
+ // We pause after each call to make sure
601
+ // that other benchmarks are not affected by frequency throttling.
602
+ // This was initially introduced for AVX-512 only, but it is probably
603
+ // wise to have it always.
604
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
605
+ }
606
+
607
+ void Benchmark::run_validate_utf8(const simdutf::implementation &implementation,
608
+ size_t iterations) {
609
+ const char *data = reinterpret_cast<const char *>(input_data.data());
610
+ const size_t size = input_data.size();
611
+ volatile bool sink{false};
612
+
613
+ auto proc = [&implementation, data, size, &sink]() {
614
+ sink = implementation.validate_utf8(data, size);
615
+ };
616
+
617
+ count_events(proc, iterations); // warming up!
618
+ const auto result = count_events(proc, iterations);
619
+ if ((sink == false) && (iterations > 0)) {
620
+ std::cerr << "The input was declared invalid.\n";
621
+ }
622
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
623
+ print_summary(result, size, char_count);
624
+ }
625
+
626
+ void Benchmark::run_validate_utf8_with_errors(
627
+ const simdutf::implementation &implementation, size_t iterations) {
628
+ const char *data = reinterpret_cast<const char *>(input_data.data());
629
+ const size_t size = input_data.size();
630
+ volatile bool sink{false};
631
+
632
+ auto proc = [&implementation, data, size, &sink]() {
633
+ result res = implementation.validate_utf8_with_errors(data, size);
634
+ sink = !(res.error);
635
+ };
636
+
637
+ count_events(proc, iterations); // warming up!
638
+ const auto result = count_events(proc, iterations);
639
+ if ((sink == false) && (iterations > 0)) {
640
+ std::cerr << "The input was declared invalid.\n";
641
+ }
642
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
643
+ print_summary(result, size, char_count);
644
+ }
645
+
646
namespace details {
// Naive ASCII check: returns true when none of the `size` bytes starting at
// `data` has its most significant bit set. All bytes are OR-ed together, so
// the loop never exits early.
bool ascii_is_valid(const char *data, size_t size) {
  unsigned char seen_bits = 0;
  const char *const stop = data + size;
  for (const char *cursor = data; cursor != stop; ++cursor) {
    seen_bits |= static_cast<unsigned char>(*cursor);
  }
  return seen_bits < 0x80;
}
} // namespace details
655
+
656
+ void Benchmark::run_naive_validate_ascii(
657
+ const simdutf::implementation &implementation, size_t iterations) {
658
+ const char *data = reinterpret_cast<const char *>(input_data.data());
659
+ const size_t size = input_data.size();
660
+ volatile bool sink{false};
661
+ auto proc = [&implementation, data, size, &sink]() {
662
+ sink = details::ascii_is_valid(data, size);
663
+ };
664
+
665
+ count_events(proc, iterations); // warming up!
666
+ const auto result = count_events(proc, iterations);
667
+ if ((sink == false) && (iterations > 0)) {
668
+ std::cerr << "The input was declared invalid.\n";
669
+ }
670
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
671
+ print_summary(result, size, char_count);
672
+ }
673
+
674
+ void Benchmark::run_validate_ascii(
675
+ const simdutf::implementation &implementation, size_t iterations) {
676
+ const char *data = reinterpret_cast<const char *>(input_data.data());
677
+ const size_t size = input_data.size();
678
+ volatile bool sink{false};
679
+
680
+ auto proc = [&implementation, data, size, &sink]() {
681
+ sink = implementation.validate_ascii(data, size);
682
+ };
683
+
684
+ count_events(proc, iterations); // warming up!
685
+ const auto result = count_events(proc, iterations);
686
+ if ((sink == false) && (iterations > 0)) {
687
+ std::cerr << "The input was declared invalid.\n";
688
+ }
689
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
690
+ print_summary(result, size, char_count);
691
+ }
692
+
693
+ void Benchmark::run_validate_ascii_with_errors(
694
+ const simdutf::implementation &implementation, size_t iterations) {
695
+ const char *data = reinterpret_cast<const char *>(input_data.data());
696
+ const size_t size = input_data.size();
697
+ volatile bool sink{false};
698
+
699
+ auto proc = [&implementation, data, size, &sink]() {
700
+ result res = implementation.validate_ascii_with_errors(data, size);
701
+ sink = !(res.error);
702
+ };
703
+
704
+ count_events(proc, iterations); // warming up!
705
+ const auto result = count_events(proc, iterations);
706
+ if ((sink == false) && (iterations > 0)) {
707
+ std::cerr << "The input was declared invalid.\n";
708
+ }
709
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
710
+ print_summary(result, size, char_count);
711
+ }
712
+
713
+ void Benchmark::run_validate_utf16le(
714
+ const simdutf::implementation &implementation, size_t iterations) {
715
+ const simdutf::encoding_type bom =
716
+ BOM::check_bom(input_data.data(), input_data.size());
717
+ const char16_t *data = reinterpret_cast<const char16_t *>(
718
+ input_data.data() + BOM::bom_byte_size(bom));
719
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
720
+ if (size % 2 != 0) {
721
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
722
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
723
+ printf(" Running function on truncated input.\n");
724
+ }
725
+
726
+ size /= 2;
727
+
728
+ volatile bool sink{false};
729
+
730
+ auto proc = [&implementation, data, size, &sink]() {
731
+ sink = implementation.validate_utf16le(data, size);
732
+ };
733
+ count_events(proc, iterations); // warming up!
734
+ const auto result = count_events(proc, iterations);
735
+ if ((sink == false) && (iterations > 0)) {
736
+ std::cerr << "The input was declared invalid.\n";
737
+ }
738
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
739
+ print_summary(result, input_data.size(), char_count);
740
+ }
741
+
742
+ void Benchmark::run_validate_utf16le_with_errors(
743
+ const simdutf::implementation &implementation, size_t iterations) {
744
+ const simdutf::encoding_type bom =
745
+ BOM::check_bom(input_data.data(), input_data.size());
746
+ const char16_t *data = reinterpret_cast<const char16_t *>(
747
+ input_data.data() + BOM::bom_byte_size(bom));
748
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
749
+ if (size % 2 != 0) {
750
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
751
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
752
+ printf(" Running function on truncated input.\n");
753
+ }
754
+
755
+ size /= 2;
756
+
757
+ volatile bool sink{false};
758
+
759
+ auto proc = [&implementation, data, size, &sink]() {
760
+ result res = implementation.validate_utf16le_with_errors(data, size);
761
+ sink = !(res.error);
762
+ };
763
+ count_events(proc, iterations); // warming up!
764
+ const auto result = count_events(proc, iterations);
765
+ if ((sink == false) && (iterations > 0)) {
766
+ std::cerr << "The input was declared invalid.\n";
767
+ }
768
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
769
+ print_summary(result, input_data.size(), char_count);
770
+ }
771
+
772
+ void Benchmark::run_validate_utf32(
773
+ const simdutf::implementation &implementation, size_t iterations) {
774
+ const simdutf::encoding_type bom =
775
+ BOM::check_bom(input_data.data(), input_data.size());
776
+ const char32_t *data = reinterpret_cast<const char32_t *>(
777
+ input_data.data() + BOM::bom_byte_size(bom));
778
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
779
+ if (size % 2 != 0) {
780
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
781
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
782
+ printf(" Running function on truncated input.\n");
783
+ }
784
+
785
+ size /= 4;
786
+
787
+ volatile bool sink{false};
788
+
789
+ auto proc = [&implementation, data, size, &sink]() {
790
+ sink = implementation.validate_utf32(data, size);
791
+ };
792
+ count_events(proc, iterations); // warming up!
793
+ const auto result = count_events(proc, iterations);
794
+ if ((sink == false) && (iterations > 0)) {
795
+ std::cerr << "The input was declared invalid.\n";
796
+ }
797
+ size_t char_count = size;
798
+ print_summary(result, input_data.size(), char_count);
799
+ }
800
+
801
+ void Benchmark::run_validate_utf32_with_errors(
802
+ const simdutf::implementation &implementation, size_t iterations) {
803
+ const simdutf::encoding_type bom =
804
+ BOM::check_bom(input_data.data(), input_data.size());
805
+ const char32_t *data = reinterpret_cast<const char32_t *>(
806
+ input_data.data() + BOM::bom_byte_size(bom));
807
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
808
+ if (size % 4 != 0) {
809
+ printf(
810
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
811
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
812
+ printf(" Running function on truncated input.\n");
813
+ }
814
+
815
+ size /= 4;
816
+
817
+ volatile bool sink{false};
818
+
819
+ auto proc = [&implementation, data, size, &sink]() {
820
+ result res = implementation.validate_utf32_with_errors(data, size);
821
+ sink = !(res.error);
822
+ };
823
+ count_events(proc, iterations); // warming up!
824
+ const auto result = count_events(proc, iterations);
825
+ if ((sink == false) && (iterations > 0)) {
826
+ std::cerr << "The input was declared invalid.\n";
827
+ }
828
+ size_t char_count = size;
829
+ print_summary(result, input_data.size(), char_count);
830
+ }
831
+
832
+ void Benchmark::run_convert_latin1_to_utf8(
833
+ const simdutf::implementation &implementation, size_t iterations) {
834
+ const char *data = reinterpret_cast<const char *>(input_data.data());
835
+ const size_t size = input_data.size();
836
+ std::unique_ptr<char[]> output_buffer{new char[size * 2]};
837
+ volatile size_t sink{0};
838
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
839
+ sink =
840
+ implementation.convert_latin1_to_utf8(data, size, output_buffer.get());
841
+ };
842
+ count_events(proc, iterations); // warming up!
843
+ const auto result = count_events(proc, iterations);
844
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
845
+ std::cerr << "The output is zero which might indicate an error.\n";
846
+ }
847
+ size_t char_count = size;
848
+ print_summary(result, size, char_count);
849
+ }
850
+
851
+ void Benchmark::run_convert_latin1_to_utf16le(
852
+ const simdutf::implementation &implementation, size_t iterations) {
853
+ const char *data = reinterpret_cast<const char *>(input_data.data());
854
+ const size_t size = input_data.size();
855
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2]};
856
+ volatile size_t sink{0};
857
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
858
+ sink = implementation.convert_latin1_to_utf16le(data, size,
859
+ output_buffer.get());
860
+ };
861
+ count_events(proc, iterations); // warming up!
862
+ const auto result = count_events(proc, iterations);
863
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
864
+ std::cerr << "The output is zero which might indicate an error.\n";
865
+ }
866
+ size_t char_count = size;
867
+ print_summary(result, size, char_count);
868
+ }
869
+
870
+ void Benchmark::run_convert_latin1_to_utf32(
871
+ const simdutf::implementation &implementation, size_t iterations) {
872
+ const char *data = reinterpret_cast<const char *>(input_data.data());
873
+ const size_t size = input_data.size();
874
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
875
+ volatile size_t sink{0};
876
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
877
+ sink =
878
+ implementation.convert_latin1_to_utf32(data, size, output_buffer.get());
879
+ };
880
+ count_events(proc, iterations); // warming up!
881
+ const auto result = count_events(proc, iterations);
882
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
883
+ std::cerr << "The output is zero which might indicate an error.\n";
884
+ }
885
+ size_t char_count = size;
886
+ print_summary(result, size, char_count);
887
+ }
888
+
889
+ void Benchmark::run_utf8_length_from_latin1(
890
+ const simdutf::implementation &implementation, size_t iterations) {
891
+ const char *data = reinterpret_cast<const char *>(input_data.data());
892
+ const size_t size = input_data.size();
893
+ volatile size_t sink{0};
894
+
895
+ auto proc = [&implementation, data, size, &sink]() {
896
+ sink = implementation.utf8_length_from_latin1(data, size);
897
+ };
898
+ count_events(proc, iterations); // warming up!
899
+ const auto result = count_events(proc, iterations);
900
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
901
+ std::cerr << "The output is zero which might indicate an error.\n";
902
+ }
903
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
904
+ print_summary(result, size, char_count);
905
+ }
906
+
907
+ void Benchmark::run_utf8_length_from_utf16le(
908
+ const simdutf::implementation &implementation, size_t iterations) {
909
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
910
+ const size_t size = input_data.size() / 2;
911
+ volatile size_t sink{0};
912
+
913
+ auto proc = [&implementation, data, size, &sink]() {
914
+ sink = implementation.utf8_length_from_utf16le(data, size);
915
+ };
916
+ count_events(proc, iterations); // warming up!
917
+ const auto result = count_events(proc, iterations);
918
+ print_summary(result, size, size);
919
+ }
920
+
921
+ void Benchmark::run_utf8_length_from_utf16be(
922
+ const simdutf::implementation &implementation, size_t iterations) {
923
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
924
+ const size_t size = input_data.size() / 2;
925
+ volatile size_t sink{0};
926
+
927
+ auto proc = [&implementation, data, size, &sink]() {
928
+ sink = implementation.utf8_length_from_utf16be(data, size);
929
+ };
930
+ count_events(proc, iterations); // warming up!
931
+ const auto result = count_events(proc, iterations);
932
+ print_summary(result, size, size);
933
+ }
934
+
935
+ void Benchmark::run_utf8_length_from_utf16le_with_replacement(
936
+ const simdutf::implementation &implementation, size_t iterations) {
937
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
938
+ const size_t size = input_data.size() / 2;
939
+ volatile size_t sink{0};
940
+
941
+ auto proc = [&implementation, data, size, &sink]() {
942
+ auto r =
943
+ implementation.utf8_length_from_utf16le_with_replacement(data, size);
944
+ sink = r.count;
945
+ };
946
+ count_events(proc, iterations); // warming up!
947
+ const auto result = count_events(proc, iterations);
948
+ print_summary(result, size, size);
949
+ }
950
+
951
+ void Benchmark::run_utf8_length_from_utf16be_with_replacement(
952
+ const simdutf::implementation &implementation, size_t iterations) {
953
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
954
+ const size_t size = input_data.size() / 2;
955
+ volatile size_t sink{0};
956
+
957
+ auto proc = [&implementation, data, size, &sink]() {
958
+ auto r =
959
+ implementation.utf8_length_from_utf16be_with_replacement(data, size);
960
+ sink = r.count;
961
+ };
962
+ count_events(proc, iterations); // warming up!
963
+ const auto result = count_events(proc, iterations);
964
+ print_summary(result, size, size);
965
+ }
966
+
967
+ void Benchmark::run_utf8_length_from_utf32(
968
+ const simdutf::implementation &implementation, size_t iterations) {
969
+ const char32_t *data = reinterpret_cast<const char32_t *>(input_data.data());
970
+ const size_t size = input_data.size() / 4;
971
+ volatile size_t sink{0};
972
+
973
+ auto proc = [&implementation, data, size, &sink]() {
974
+ sink = implementation.utf8_length_from_utf32(data, size);
975
+ };
976
+ count_events(proc, iterations); // warming up!
977
+ const auto result = count_events(proc, iterations);
978
+ print_summary(result, size, size);
979
+ }
980
+
981
+ void Benchmark::run_to_well_formed_utf16le(
982
+ const simdutf::implementation &implementation, size_t iterations) {
983
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
984
+ const size_t size = input_data.size() / 2;
985
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
986
+ auto proc = [&implementation, data, size, &output_buffer]() {
987
+ implementation.to_well_formed_utf16le(data, size, output_buffer.get());
988
+ };
989
+ count_events(proc, iterations); // warming up!
990
+ const auto result = count_events(proc, iterations);
991
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
992
+ print_summary(result, input_data.size(), char_count);
993
+ }
994
+
995
+ void Benchmark::run_utf16_length_from_utf8(
996
+ const simdutf::implementation &implementation, size_t iterations) {
997
+ const char *data = reinterpret_cast<const char *>(input_data.data());
998
+ const size_t size = input_data.size() / 4;
999
+ volatile size_t sink{0};
1000
+
1001
+ auto proc = [&implementation, data, size, &sink]() {
1002
+ sink = implementation.utf16_length_from_utf8(data, size);
1003
+ };
1004
+ count_events(proc, iterations); // warming up!
1005
+ const auto result = count_events(proc, iterations);
1006
+ print_summary(result, size, size);
1007
+ }
1008
+
1009
// Returns the number of set bits in `v`. Uses the compiler intrinsic where
// one is selected by the preprocessor checks below, and a branch-free
// bit-twiddling fallback otherwise.
static inline uint32_t portable_popcount(uint64_t v) {
#ifdef __GNUC__
  const int set_bits = __builtin_popcountll(v);
  return static_cast<uint32_t>(set_bits);
#elif defined(_WIN64) && defined(_MSC_VER) && _MSC_VER >= 1400 &&              \
    !defined(_M_ARM64)
  const auto set_bits = __popcnt64(static_cast<__int64>(v));
  return static_cast<uint32_t>(set_bits);
#else
  // Sum bits within 2-bit groups, then 4-bit groups, then bytes; the final
  // multiply gathers all byte sums into the top byte.
  const uint64_t per_pair = v - ((v >> 1) & 0x5555555555555555);
  const uint64_t per_nibble =
      (per_pair & 0x3333333333333333) + ((per_pair >> 2) & 0x3333333333333333);
  const uint64_t per_byte = (per_nibble + (per_nibble >> 4)) & 0x0F0F0F0F0F0F0F0F;
  return static_cast<uint32_t>((per_byte * (0x0101010101010101)) >> 56);
#endif
}
1022
+
1023
+ void Benchmark::run_utf8_length_from_latin1_node(size_t iterations) {
1024
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1025
+ const size_t size = input_data.size();
1026
+ volatile size_t sink{0};
1027
+
1028
+ auto proc = [data, size, &sink]() {
1029
+ // from https://github.com/nodejs/node/pull/54345
1030
+ uint32_t length = size;
1031
+ uint32_t result = length;
1032
+ uint32_t i = 0;
1033
+ const auto length8 = length & ~0x7;
1034
+ while (i < length8) {
1035
+ // Original PR used std::popcount, but it is not available pre-C++20.
1036
+ result += portable_popcount(
1037
+ *reinterpret_cast<const uint64_t *>(data + i) & 0x8080808080808080);
1038
+ i += 8;
1039
+ }
1040
+ while (i < length) {
1041
+ result += (data[i] >> 7);
1042
+ i++;
1043
+ }
1044
+ sink = result;
1045
+ };
1046
+ count_events(proc, iterations); // warming up!
1047
+ const auto result = count_events(proc, iterations);
1048
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1049
+ std::cerr << "The output is zero which might indicate an error.\n";
1050
+ }
1051
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1052
+ print_summary(result, size, char_count);
1053
+ }
1054
+
1055
+ void Benchmark::run_convert_utf8_to_latin1(
1056
+ const simdutf::implementation &implementation, size_t iterations) {
1057
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1058
+ const size_t size = input_data.size();
1059
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1060
+ volatile size_t sink{0};
1061
+
1062
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1063
+ sink =
1064
+ implementation.convert_utf8_to_latin1(data, size, output_buffer.get());
1065
+ };
1066
+ count_events(proc, iterations); // warming up!
1067
+ const auto result = count_events(proc, iterations);
1068
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1069
+ std::cerr << "The output is zero which might indicate an error.\n";
1070
+ }
1071
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1072
+ print_summary(result, size, char_count);
1073
+ }
1074
+
1075
+ void Benchmark::run_convert_utf8_to_latin1_with_errors(
1076
+ const simdutf::implementation &implementation, size_t iterations) {
1077
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1078
+ const size_t size = input_data.size();
1079
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1080
+ volatile bool sink{false};
1081
+
1082
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1083
+ result res = implementation.convert_utf8_to_latin1_with_errors(
1084
+ data, size, output_buffer.get());
1085
+ sink = !(res.error);
1086
+ };
1087
+ count_events(proc, iterations); // warming up!
1088
+ const auto result = count_events(proc, iterations);
1089
+ if ((sink == false) && (iterations > 0)) {
1090
+ std::cerr << "The input was declared invalid.\n";
1091
+ }
1092
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1093
+ print_summary(result, size, char_count);
1094
+ }
1095
+
1096
+ void Benchmark::run_convert_valid_utf8_to_latin1(
1097
+ const simdutf::implementation &implementation, size_t iterations) {
1098
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1099
+ const size_t size = input_data.size();
1100
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1101
+ volatile size_t sink{0};
1102
+
1103
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1104
+ sink = implementation.convert_valid_utf8_to_latin1(data, size,
1105
+ output_buffer.get());
1106
+ };
1107
+ count_events(proc, iterations); // warming up!
1108
+ const auto result = count_events(proc, iterations);
1109
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1110
+ std::cerr << "The output is zero which might indicate an error.\n";
1111
+ }
1112
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1113
+ print_summary(result, size, char_count);
1114
+ }
1115
+
1116
+ void Benchmark::run_convert_utf8_to_utf16le(
1117
+ const simdutf::implementation &implementation, size_t iterations) {
1118
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1119
+ const size_t size = input_data.size();
1120
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1121
+ volatile size_t sink{0};
1122
+
1123
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1124
+ sink =
1125
+ implementation.convert_utf8_to_utf16le(data, size, output_buffer.get());
1126
+ };
1127
+ count_events(proc, iterations); // warming up!
1128
+ const auto result = count_events(proc, iterations);
1129
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1130
+ std::cerr << "The output is zero which might indicate an error.\n";
1131
+ }
1132
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1133
+ print_summary(result, size, char_count);
1134
+ }
1135
+
1136
+ void Benchmark::run_convert_utf8_to_utf16le_with_errors(
1137
+ const simdutf::implementation &implementation, size_t iterations) {
1138
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1139
+ const size_t size = input_data.size();
1140
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1141
+ volatile bool sink{false};
1142
+
1143
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1144
+ result res = implementation.convert_utf8_to_utf16le_with_errors(
1145
+ data, size, output_buffer.get());
1146
+ sink = !(res.error);
1147
+ };
1148
+ count_events(proc, iterations); // warming up!
1149
+ const auto result = count_events(proc, iterations);
1150
+ if ((sink == false) && (iterations > 0)) {
1151
+ std::cerr << "The input was declared invalid.\n";
1152
+ }
1153
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1154
+ print_summary(result, size, char_count);
1155
+ }
1156
+
1157
+ void Benchmark::run_convert_utf8_to_utf32(
1158
+ const simdutf::implementation &implementation, size_t iterations) {
1159
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1160
+ const size_t size = input_data.size();
1161
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
1162
+ volatile size_t sink{0};
1163
+
1164
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1165
+ sink =
1166
+ implementation.convert_utf8_to_utf32(data, size, output_buffer.get());
1167
+ };
1168
+ count_events(proc, iterations); // warming up!
1169
+ const auto result = count_events(proc, iterations);
1170
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1171
+ std::cerr << "The output is zero which might indicate an error.\n";
1172
+ }
1173
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1174
+ print_summary(result, size, char_count);
1175
+ }
1176
+
1177
+ void Benchmark::run_convert_utf8_to_utf32_with_errors(
1178
+ const simdutf::implementation &implementation, size_t iterations) {
1179
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1180
+ const size_t size = input_data.size();
1181
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
1182
+ volatile bool sink{false};
1183
+
1184
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
1185
+ result res = implementation.convert_utf8_to_utf32_with_errors(
1186
+ data, size, output_buffer.get());
1187
+ sink = !(res.error);
1188
+ };
1189
+ count_events(proc, iterations); // warming up!
1190
+ const auto result = count_events(proc, iterations);
1191
+ if ((sink == false) && (iterations > 0)) {
1192
+ std::cerr << "The input was declared invalid.\n";
1193
+ }
1194
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1195
+ print_summary(result, size, char_count);
1196
+ }
1197
+
1198
+ void Benchmark::run_convert_utf8_to_utf16le_with_dynamic_allocation(
1199
+ const simdutf::implementation &implementation, size_t iterations) {
1200
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1201
+ const size_t size = input_data.size();
1202
+ volatile size_t sink{0};
1203
+ auto proc = [&implementation, data, size, &sink]() {
1204
+ auto dyn_size = implementation.utf16_length_from_utf8(data, size);
1205
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[dyn_size]};
1206
+ sink =
1207
+ implementation.convert_utf8_to_utf16le(data, size, output_buffer.get());
1208
+ };
1209
+ count_events(proc, iterations); // warming up!
1210
+ const auto result = count_events(proc, iterations);
1211
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1212
+ std::cerr << "The output is zero which might indicate an error.\n";
1213
+ }
1214
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1215
+ print_summary(result, size, char_count);
1216
+ }
1217
+
1218
+ void Benchmark::run_convert_utf8_to_utf32_with_dynamic_allocation(
1219
+ const simdutf::implementation &implementation, size_t iterations) {
1220
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1221
+ const size_t size = input_data.size();
1222
+ volatile size_t sink{0};
1223
+ auto proc = [&implementation, data, size, &sink]() {
1224
+ auto dyn_size = implementation.utf32_length_from_utf8(data, size);
1225
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[dyn_size]};
1226
+ sink =
1227
+ implementation.convert_utf8_to_utf32(data, size, output_buffer.get());
1228
+ };
1229
+ count_events(proc, iterations); // warming up!
1230
+ const auto result = count_events(proc, iterations);
1231
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1232
+ std::cerr << "The output is zero which might indicate an error.\n";
1233
+ }
1234
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1235
+ print_summary(result, size, char_count);
1236
+ }
1237
+
1238
+ #ifdef ICU_AVAILABLE
1239
+
1240
+ void Benchmark::run_convert_latin1_to_utf8_icu(size_t iterations) {
1241
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1242
+ const size_t size = input_data.size();
1243
+ volatile size_t sink{0};
1244
+
1245
+ // Allocate target buffer
1246
+ int32_t targetCapacity = size * 2;
1247
+ std::unique_ptr<char[]> target(new char[targetCapacity]);
1248
+
1249
+ auto proc = [data, size, &sink, &target, targetCapacity]() {
1250
+ UErrorCode status = U_ZERO_ERROR;
1251
+
1252
+ // Open converters for source and target encodings
1253
+ UConverter *latin1conv = ucnv_open("ISO-8859-1", &status);
1254
+ assert(U_SUCCESS(status));
1255
+ UConverter *utf8conv = ucnv_open("UTF-8", &status);
1256
+ assert(U_SUCCESS(status));
1257
+
1258
+ // Pointers for source and target
1259
+ const char *source = data;
1260
+ const char *sourceLimit = data + size;
1261
+ char *targetStart = target.get();
1262
+ char *targetLimit = target.get() + targetCapacity;
1263
+
1264
+ // Convert from ISO-8859-1 to UTF-8
1265
+ ucnv_convertEx(utf8conv, latin1conv, &targetStart, targetLimit, &source,
1266
+ sourceLimit, nullptr, nullptr, nullptr, nullptr, true, true,
1267
+ &status);
1268
+ assert(U_SUCCESS(status));
1269
+
1270
+ // Calculate the output size
1271
+ sink = targetStart - target.get();
1272
+
1273
+ // Clean up
1274
+ ucnv_close(utf8conv);
1275
+ ucnv_close(latin1conv);
1276
+ };
1277
+
1278
+ count_events(proc, iterations); // warming up!
1279
+ const auto result = count_events(proc, iterations);
1280
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1281
+ std::cerr
1282
+ << "The output is zero which might indicate a misconfiguration.\n";
1283
+ }
1284
+ size_t char_count = size;
1285
+ std::unique_ptr<char[]> output_buffer{new char[size * 2]};
1286
+ size_t expected = get_active_implementation()->convert_latin1_to_utf8(
1287
+ data, size, output_buffer.get());
1288
+ if (expected != sink) {
1289
+ std::cerr << "The number of characters outputted does not match.\n";
1290
+ std::cout << "Expected: " << expected << ", Sink: " << sink
1291
+ << std::endl; // print values
1292
+ }
1293
+
1294
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1295
+ std::cerr << "The output data does not match.\n";
1296
+ }
1297
+
1298
+ print_summary(result, size, char_count);
1299
+ }
1300
+
1301
+ void Benchmark::run_convert_latin1_to_utf16_icu(size_t iterations) {
1302
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1303
+ const size_t size = input_data.size();
1304
+ volatile size_t sink{0};
1305
+
1306
+ // Allocate target buffer outside lambda
1307
+ std::unique_ptr<UChar[]> target(new UChar[size * 2]);
1308
+
1309
+ auto proc = [data, size, &sink, &target]() {
1310
+ UErrorCode status = U_ZERO_ERROR;
1311
+
1312
+ // Open converter for source encoding
1313
+ UConverter *latin1conv = ucnv_open("ISO-8859-1", &status);
1314
+ assert(U_SUCCESS(status));
1315
+
1316
+ // Convert from ISO-8859-1 to UTF-16 directly
1317
+ int32_t actualTargetSize =
1318
+ ucnv_toUChars(latin1conv, target.get(), size * 2, data, size, &status);
1319
+ assert(U_SUCCESS(status));
1320
+
1321
+ // Calculate the output size in bytes
1322
+ sink = actualTargetSize * sizeof(UChar);
1323
+
1324
+ // Clean up
1325
+ ucnv_close(latin1conv);
1326
+ };
1327
+
1328
+ count_events(proc, iterations); // warming up!
1329
+ const auto result = count_events(proc, iterations);
1330
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1331
+ std::cerr
1332
+ << "The output is zero which might indicate a misconfiguration.\n";
1333
+ }
1334
+ size_t char_count = size;
1335
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1336
+ size_t expected = get_active_implementation()->convert_latin1_to_utf16le(
1337
+ data, size, output_buffer.get()); // expected char16_t units
1338
+ if (2 * expected != sink) {
1339
+ std::cerr << "The number of utf16le code units does not match.\n";
1340
+ std::cerr << "Expected: " << 2 * expected + 1 << ", Sink: " << sink
1341
+ << std::endl; // print values
1342
+ }
1343
+
1344
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1345
+ std::cerr << "The output data does not match.\n";
1346
+ // compare first 20 characters and print their hexadecimal values
1347
+ std::cout << "First 20 characters of target data: ";
1348
+ for (size_t i = 0; i < 20; i++) {
1349
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1350
+ }
1351
+ std::cout << "\nFirst 20 characters of output buffer: ";
1352
+ for (size_t i = 0; i < 20; i++) {
1353
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1354
+ }
1355
+
1356
+ // compare last 20 characters and print their hexadecimal values
1357
+ size_t num_chars = sink / sizeof(UChar);
1358
+ size_t start = num_chars < 20 ? 0 : num_chars - 20;
1359
+ std::cout << "\nLast 20 characters of target data: ";
1360
+ for (size_t i = start; i < num_chars; i++) {
1361
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1362
+ }
1363
+ std::cout << "\nLast 20 characters of output buffer: ";
1364
+ for (size_t i = start; i < num_chars; i++) {
1365
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1366
+ }
1367
+ }
1368
+
1369
+ print_summary(result, size, char_count);
1370
+ }
1371
+
1372
+ void Benchmark::run_convert_latin1_to_utf32_icu(size_t iterations) {
1373
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1374
+ const size_t size = input_data.size();
1375
+ volatile size_t sink{0};
1376
+
1377
+ std::unique_ptr<char[]> target;
1378
+
1379
+ auto proc = [&target, data, size, &sink]() {
1380
+ UErrorCode status = U_ZERO_ERROR;
1381
+
1382
+ // Open converters for source and target encodings
1383
+ UConverter *latin1conv = ucnv_open("ISO-8859-1", &status);
1384
+ assert(U_SUCCESS(status));
1385
+ UConverter *utf32conv = ucnv_open("UTF-32LE", &status);
1386
+ assert(U_SUCCESS(status));
1387
+
1388
+ // Allocate target buffer
1389
+ int32_t targetCapacity = size * 4; // UTF-32 takes four bytes.
1390
+ target.reset(new char[targetCapacity]);
1391
+
1392
+ // Pointers for source and target
1393
+ const char *source = data;
1394
+ const char *sourceLimit = data + size;
1395
+ char *targetStart = target.get();
1396
+ char *targetLimit = target.get() + targetCapacity;
1397
+
1398
+ // Convert from ISO-8859-1 to UTF-32
1399
+ ucnv_convertEx(utf32conv, latin1conv, &targetStart, targetLimit, &source,
1400
+ sourceLimit, nullptr, nullptr, nullptr, nullptr, true, true,
1401
+ &status);
1402
+ assert(U_SUCCESS(status));
1403
+
1404
+ // Calculate the output size in bytes
1405
+ sink = targetStart - target.get();
1406
+
1407
+ // Clean up
1408
+ ucnv_close(utf32conv);
1409
+ ucnv_close(latin1conv);
1410
+ };
1411
+
1412
+ count_events(proc, iterations); // warming up!
1413
+ const auto result = count_events(proc, iterations);
1414
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1415
+ std::cerr
1416
+ << "The output is zero which might indicate a misconfiguration.\n";
1417
+ }
1418
+ size_t char_count = size;
1419
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
1420
+ size_t expected = get_active_implementation()->convert_latin1_to_utf32(
1421
+ data, size, output_buffer.get()); // expected is the # of UTF32 characters
1422
+ if (4 * expected != sink) {
1423
+ std::cerr
1424
+ << "The number of characters outputted does not match.\n"; // each UTF32
1425
+ // character
1426
+ // takes four
1427
+ // bytes
1428
+ std::cout << "Expected: " << expected << ", Sink: " << sink
1429
+ << std::endl; // print values
1430
+ }
1431
+
1432
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1433
+ std::cerr << "The output data does not match.\n";
1434
+ // compare first 20 characters and print their hexadecimal values
1435
+ std::cout << "First 20 characters of target data: ";
1436
+ for (size_t i = 0; i < 20; i++) {
1437
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1438
+ }
1439
+ std::cout << "\nFirst 20 characters of output buffer: ";
1440
+ for (size_t i = 0; i < 20; i++) {
1441
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1442
+ }
1443
+
1444
+ // compare last 20 characters and print their hexadecimal values
1445
+ size_t num_chars = sink / sizeof(UChar);
1446
+ size_t start = num_chars < 20 ? 0 : num_chars - 20;
1447
+ std::cout << "\nLast 20 characters of target data: ";
1448
+ for (size_t i = start; i < num_chars; i++) {
1449
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1450
+ }
1451
+ std::cout << "\nLast 20 characters of output buffer: ";
1452
+ for (size_t i = start; i < num_chars; i++) {
1453
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1454
+ }
1455
+ }
1456
+
1457
+ print_summary(result, size, char_count);
1458
+ }
1459
+
1460
+ void Benchmark::run_convert_utf8_to_latin1_icu(size_t iterations) {
1461
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1462
+ const size_t size = input_data.size();
1463
+ volatile size_t sink{0};
1464
+
1465
+ std::unique_ptr<char[]> target;
1466
+
1467
+ auto proc = [&target, data, size, &sink]() {
1468
+ UErrorCode status = U_ZERO_ERROR;
1469
+
1470
+ // Open converters for source and target encodings
1471
+ UConverter *utf8conv = ucnv_open("UTF-8", &status);
1472
+ assert(U_SUCCESS(status));
1473
+ UConverter *latin1conv = ucnv_open("ISO-8859-1", &status);
1474
+ assert(U_SUCCESS(status));
1475
+
1476
+ // Allocate target buffer
1477
+ int32_t targetCapacity = size * 2;
1478
+ target.reset(new char[targetCapacity]);
1479
+
1480
+ // Pointers for source and target
1481
+ const char *source = data;
1482
+ const char *sourceLimit = data + size;
1483
+ char *targetStart = target.get();
1484
+ char *targetLimit = target.get() + targetCapacity;
1485
+
1486
+ // Convert from ISO-8859-1 to UTF-8
1487
+ ucnv_convertEx(latin1conv, utf8conv, &targetStart, targetLimit, &source,
1488
+ sourceLimit, nullptr, nullptr, nullptr, nullptr, true, true,
1489
+ &status);
1490
+ assert(U_SUCCESS(status));
1491
+
1492
+ // Calculate the output size
1493
+ sink = targetStart - target.get();
1494
+
1495
+ // Clean up
1496
+ ucnv_close(utf8conv);
1497
+ ucnv_close(latin1conv);
1498
+ };
1499
+
1500
+ count_events(proc, iterations); // warming up!
1501
+ const auto result = count_events(proc, iterations);
1502
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1503
+ std::cerr
1504
+ << "The output is zero which might indicate a misconfiguration.\n";
1505
+ }
1506
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1507
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1508
+ size_t expected = get_active_implementation()->convert_utf8_to_latin1(
1509
+ data, size, output_buffer.get());
1510
+ if (expected != sink) {
1511
+ std::cerr << "The number of latin1 code units does not match.\n";
1512
+ }
1513
+
1514
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1515
+ std::cerr << "The output data does not match.\n";
1516
+ // compare first 20 characters and print their hexadecimal values
1517
+ std::cout << "First 20 characters of target data: ";
1518
+ for (size_t i = 0; i < 20; i++) {
1519
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1520
+ }
1521
+ std::cout << "\nFirst 20 characters of output buffer: ";
1522
+ for (size_t i = 0; i < 20; i++) {
1523
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1524
+ }
1525
+ }
1526
+
1527
+ print_summary(result, size, char_count);
1528
+ }
1529
+
1530
+ void Benchmark::run_convert_utf8_to_utf16_icu(size_t iterations) {
1531
+ const char *data = reinterpret_cast<const char *>(input_data.data());
1532
+ const size_t size = input_data.size();
1533
+ volatile size_t sink{0};
1534
+ auto proc = [data, size, &sink]() {
1535
+ auto str =
1536
+ U_ICU_NAMESPACE::UnicodeString::fromUTF8(std::string_view(data, size));
1537
+ sink = str.length();
1538
+ };
1539
+ count_events(proc, iterations); // warming up!
1540
+ const auto result = count_events(proc, iterations);
1541
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1542
+ std::cerr
1543
+ << "The output is zero which might indicate a misconfiguration.\n";
1544
+ }
1545
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1546
+ // checking
1547
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1548
+ size_t expected = convert_utf8_to_utf16le(data, size, output_buffer.get());
1549
+ if (expected != sink) {
1550
+ std::cerr << "The number of UTF-16 code units does not match.\n";
1551
+ }
1552
+ print_summary(result, size, char_count);
1553
+ }
1554
+ void Benchmark::run_convert_utf16_to_utf8_icu(size_t iterations) {
1555
+ const simdutf::encoding_type bom =
1556
+ BOM::check_bom(input_data.data(), input_data.size());
1557
+ const char16_t *data = reinterpret_cast<const char16_t *>(
1558
+ input_data.data() + BOM::bom_byte_size(bom));
1559
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
1560
+ if (size % 2 != 0) {
1561
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
1562
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
1563
+ printf(" Running function on truncated input.\n");
1564
+ }
1565
+ size /= 2;
1566
+ volatile size_t sink{0};
1567
+
1568
+ auto proc = [data, size, &sink]() {
1569
+ U_ICU_NAMESPACE::UnicodeString str(data, size);
1570
+ std::string out;
1571
+ out = str.toUTF8String(out);
1572
+ sink = out.size();
1573
+ };
1574
+ count_events(proc, iterations); // warming up!
1575
+ const auto result = count_events(proc, iterations);
1576
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1577
+ std::cerr << "The output is zero which might indicate an error.\n";
1578
+ }
1579
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
1580
+ print_summary(result, input_data.size(), char_count);
1581
+ }
1582
+
1583
+ void Benchmark::run_convert_utf16_to_latin1_icu(size_t iterations) {
1584
+ const simdutf::encoding_type bom =
1585
+ BOM::check_bom(input_data.data(), input_data.size());
1586
+ const char16_t *data = reinterpret_cast<const char16_t *>(
1587
+ input_data.data() + BOM::bom_byte_size(bom));
1588
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
1589
+ if (size % 2 != 0) {
1590
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
1591
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
1592
+ printf(" Running function on truncated input.\n");
1593
+ }
1594
+ size /= 2;
1595
+ volatile size_t sink{0};
1596
+
1597
+ std::unique_ptr<char[]> target;
1598
+
1599
+ auto proc = [&target, data, size, &sink]() {
1600
+ UErrorCode status = U_ZERO_ERROR;
1601
+ UConverter *conv =
1602
+ ucnv_open("ISO-8859-1", &status); // open a converter for ISO-8859-1
1603
+ assert(U_SUCCESS(status));
1604
+
1605
+ int32_t targetCapacity = size; // adjust as needed
1606
+ target.reset(new char[targetCapacity]);
1607
+ char *targetStart = target.get();
1608
+
1609
+ sink =
1610
+ ucnv_fromUChars(conv, targetStart, targetCapacity,
1611
+ reinterpret_cast<const UChar *>(data), size, &status);
1612
+ assert(U_SUCCESS(status));
1613
+
1614
+ // Clean up
1615
+ ucnv_close(conv);
1616
+ };
1617
+
1618
+ count_events(proc, iterations); // warming up!
1619
+ const auto result = count_events(proc, iterations);
1620
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1621
+ std::cerr
1622
+ << "The output is zero which might indicate a misconfiguration.\n";
1623
+ }
1624
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
1625
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1626
+ size_t expected = get_active_implementation()->convert_utf16le_to_latin1(
1627
+ data, size, output_buffer.get());
1628
+ if (expected != sink) {
1629
+ std::cerr << "The number of expected bytes does not match.\n";
1630
+ std::cout << "Expected: " << expected << ", Sink: " << sink
1631
+ << std::endl; // print values
1632
+ }
1633
+
1634
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1635
+ std::cerr << "The output data does not match.\n";
1636
+ // compare first 20 characters and print their hexadecimal values
1637
+ std::cout << "First 20 characters of target data: ";
1638
+ for (size_t i = 0; i < 20; i++) {
1639
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1640
+ }
1641
+ std::cout << "\nFirst 20 characters of output buffer: ";
1642
+ for (size_t i = 0; i < 20; i++) {
1643
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1644
+ }
1645
+ }
1646
+
1647
+ print_summary(result, input_data.size(), char_count);
1648
+ }
1649
+
1650
+ void Benchmark::run_convert_utf32_to_latin1_icu(size_t iterations) {
1651
+ const simdutf::encoding_type bom =
1652
+ BOM::check_bom(input_data.data(), input_data.size());
1653
+ const char32_t *data = reinterpret_cast<const char32_t *>(
1654
+ input_data.data() + BOM::bom_byte_size(bom));
1655
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
1656
+ if (size % 4 != 0) {
1657
+ printf(
1658
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
1659
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
1660
+ printf(" Running function on truncated input.\n");
1661
+ }
1662
+
1663
+ size /= 4;
1664
+ volatile size_t sink{0};
1665
+ std::unique_ptr<char[]> target;
1666
+
1667
+ auto proc = [&target, data, size, &sink]() {
1668
+ UErrorCode status = U_ZERO_ERROR;
1669
+
1670
+ UConverter *utf32conv =
1671
+ ucnv_open("UTF-32LE", &status); // create a UTF-32 converter
1672
+ assert(U_SUCCESS(status));
1673
+
1674
+ UConverter *latin1conv =
1675
+ ucnv_open("ISO-8859-1", &status); // create a Latin1 converter
1676
+ assert(U_SUCCESS(status));
1677
+
1678
+ int32_t targetCapacity = size; // adjust as needed
1679
+ target.reset(new char[targetCapacity]);
1680
+ char *targetStart = target.get();
1681
+
1682
+ const char *sourceStart = reinterpret_cast<const char *>(data);
1683
+ const char *sourceEnd = sourceStart + size * sizeof(char32_t);
1684
+
1685
+ // Convert from UTF-32 to Latin1
1686
+ ucnv_convertEx(latin1conv, utf32conv, &targetStart,
1687
+ targetStart + targetCapacity, &sourceStart, sourceEnd,
1688
+ nullptr, nullptr, nullptr, nullptr, true, true, &status);
1689
+ assert(U_SUCCESS(status));
1690
+
1691
+ // Calculate the output size
1692
+ sink = targetStart - target.get();
1693
+
1694
+ // Clean up
1695
+ ucnv_close(utf32conv);
1696
+ ucnv_close(latin1conv);
1697
+ };
1698
+
1699
+ count_events(proc, iterations); // warming up!
1700
+ const auto result = count_events(proc, iterations);
1701
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1702
+ std::cerr
1703
+ << "The output is zero which might indicate a misconfiguration.\n";
1704
+ }
1705
+ size_t char_count = size;
1706
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1707
+ size_t expected = get_active_implementation()->convert_utf32_to_latin1(
1708
+ data, size, output_buffer.get());
1709
+ if (expected != sink) {
1710
+ std::cerr << "The number of expected bytes does not match.\n";
1711
+ std::cout << "Expected: " << expected << ", Sink: " << sink
1712
+ << std::endl; // print values
1713
+ }
1714
+
1715
+ if (memcmp(target.get(), output_buffer.get(), sink) != 0) {
1716
+ std::cerr << "The output data does not match.\n";
1717
+ // compare first 20 characters and print their hexadecimal values
1718
+ std::cout << "First 20 characters of target data: ";
1719
+ for (size_t i = 0; i < 20; i++) {
1720
+ std::cout << std::hex << static_cast<int>(target.get()[i]) << " ";
1721
+ }
1722
+ std::cout << "\nFirst 20 characters of output buffer: ";
1723
+ for (size_t i = 0; i < 20; i++) {
1724
+ std::cout << std::hex << static_cast<int>(output_buffer[i]) << " ";
1725
+ }
1726
+ }
1727
+
1728
+ print_summary(result, input_data.size(), char_count);
1729
+ }
1730
+
1731
+ #endif
1732
+
1733
+ #ifdef ICONV_AVAILABLE
1734
+ void Benchmark::run_convert_latin1_to_utf8_iconv(size_t iterations) {
1735
+ iconv_t cv = iconv_open("UTF-8", "ISO-8859-1");
1736
+ if (cv == (iconv_t)(-1)) {
1737
+ fprintf(stderr,
1738
+ "[iconv] cannot initialize ISO-8859-1 to UTF-8 converter\n");
1739
+ return;
1740
+ }
1741
+ char *data = reinterpret_cast<char *>(input_data.data());
1742
+ const size_t size = input_data.size();
1743
+ std::unique_ptr<char[]> output_buffer{new char[size * 2]}; // 2 for safety
1744
+ volatile size_t sink{0};
1745
+ auto proc = [&cv, data, size, &output_buffer, &sink]() {
1746
+ size_t inbytes = size;
1747
+ size_t outbytes = sizeof(uint8_t) * size * 2;
1748
+ #ifdef WINICONV_CONST
1749
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1750
+ #else
1751
+ char *inptr = data;
1752
+ #endif
1753
+ char *outptr = reinterpret_cast<char *>(output_buffer.get());
1754
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1755
+ if (result == static_cast<size_t>(-1)) {
1756
+ sink = 0;
1757
+ } else {
1758
+ sink = (sizeof(uint8_t) * size - outbytes) / sizeof(char);
1759
+ }
1760
+ };
1761
+ count_events(proc, iterations); // warming up!
1762
+ const auto result = count_events(proc, iterations);
1763
+ iconv_close(cv);
1764
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1765
+ std::cerr << "The output is zero which might indicate an error.\n";
1766
+ }
1767
+ size_t char_count = size;
1768
+ print_summary(result, size, char_count);
1769
+ }
1770
+
1771
+ void Benchmark::run_convert_latin1_to_utf16_iconv(size_t iterations) {
1772
+ iconv_t cv = iconv_open("UTF-16", "ISO-8859-1");
1773
+ if (cv == (iconv_t)(-1)) {
1774
+ fprintf(stderr,
1775
+ "[iconv] cannot initialize ISO-8859-1 to UTF-16 converter\n");
1776
+ return;
1777
+ }
1778
+ char *data = reinterpret_cast<char *>(input_data.data());
1779
+ const size_t size = input_data.size();
1780
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1781
+ volatile size_t sink{0};
1782
+ auto proc = [&cv, data, size, &output_buffer, &sink]() {
1783
+ size_t inbytes = size;
1784
+ size_t outbytes = sizeof(uint16_t) * size;
1785
+ #ifdef WINICONV_CONST
1786
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1787
+ #else
1788
+ char *inptr = data;
1789
+ #endif
1790
+ char *outptr = reinterpret_cast<char *>(output_buffer.get());
1791
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1792
+ if (result == static_cast<size_t>(-1)) {
1793
+ sink = 0;
1794
+ } else {
1795
+ sink = (sizeof(uint16_t) * size - outbytes) / sizeof(char);
1796
+ }
1797
+ };
1798
+ count_events(proc, iterations); // warming up!
1799
+ const auto result = count_events(proc, iterations);
1800
+ iconv_close(cv);
1801
+ size_t char_count = size;
1802
+ print_summary(result, size, char_count);
1803
+ }
1804
+
1805
+ void Benchmark::run_convert_latin1_to_utf32_iconv(size_t iterations) {
1806
+ iconv_t cv = iconv_open("UTF-32LE", "ISO-8859-1");
1807
+ if (cv == (iconv_t)(-1)) {
1808
+ fprintf(stderr,
1809
+ "[iconv] cannot initialize ISO-8859-1 to UTF-32 converter\n");
1810
+ return;
1811
+ }
1812
+ char *data = reinterpret_cast<char *>(input_data.data());
1813
+ const size_t size = input_data.size();
1814
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
1815
+ volatile size_t sink{0};
1816
+ auto proc = [&cv, data, size, &output_buffer, &sink]() {
1817
+ size_t inbytes = size;
1818
+ size_t outbytes = sizeof(uint32_t) * size;
1819
+ #ifdef WINICONV_CONST
1820
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1821
+ #else
1822
+ char *inptr = data;
1823
+ #endif
1824
+ char *outptr = reinterpret_cast<char *>(output_buffer.get());
1825
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1826
+ if (result == static_cast<size_t>(-1)) {
1827
+ sink = 0;
1828
+ } else {
1829
+ sink = (sizeof(uint32_t) * size - outbytes) / sizeof(char);
1830
+ ;
1831
+ }
1832
+ };
1833
+ count_events(proc, iterations); // warming up!
1834
+ const auto result = count_events(proc, iterations);
1835
+ iconv_close(cv);
1836
+ size_t char_count = size;
1837
+ print_summary(result, size, char_count);
1838
+ }
1839
+
1840
+ void Benchmark::run_convert_utf8_to_latin1_iconv(size_t iterations) {
1841
+ iconv_t cv = iconv_open("ISO-8859-1", "UTF-8");
1842
+ if (cv == (iconv_t)(-1)) {
1843
+ fprintf(stderr, "[iconv] cannot initialize UTF-8 to Latin1 converter\n");
1844
+ return;
1845
+ }
1846
+ char *data = reinterpret_cast<char *>(input_data.data());
1847
+ const size_t size = input_data.size();
1848
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1849
+ volatile size_t sink{0};
1850
+
1851
+ auto proc = [&cv, data, size, &output_buffer, &sink]() {
1852
+ size_t inbytes = size;
1853
+ size_t outbytes = sizeof(uint8_t) * size;
1854
+ // win-iconv includes WINICONV_CONST in its function signatures
1855
+ // https://github.com/simdutf/simdutf/pull/178
1856
+ #ifdef WINICONV_CONST
1857
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1858
+ #else
1859
+ char *inptr = data;
1860
+ #endif
1861
+ char *outptr = reinterpret_cast<char *>(output_buffer.get());
1862
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1863
+ if (result == static_cast<size_t>(-1)) {
1864
+ sink = 0;
1865
+ } else {
1866
+ sink = (sizeof(uint8_t) * size - outbytes) / sizeof(char);
1867
+ ;
1868
+ }
1869
+ };
1870
+ count_events(proc, iterations); // warming up!
1871
+ const auto result = count_events(proc, iterations);
1872
+ iconv_close(cv);
1873
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1874
+ std::cerr << "The output is zero which might indicate an error.\n";
1875
+ }
1876
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1877
+ print_summary(result, size, char_count);
1878
+ }
1879
+
1880
+ void Benchmark::run_convert_utf8_to_utf16_iconv(size_t iterations) {
1881
+ iconv_t cv = iconv_open("UTF-16LE", "UTF-8");
1882
+ if (cv == (iconv_t)(-1)) {
1883
+ fprintf(stderr, "[iconv] cannot initialize UTF-8 to UTF-16LE converter\n");
1884
+ return;
1885
+ }
1886
+ char *data = reinterpret_cast<char *>(input_data.data());
1887
+ const size_t size = input_data.size();
1888
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
1889
+ volatile size_t sink{0};
1890
+
1891
+ auto proc = [&cv, data, size, &output_buffer, &sink]() {
1892
+ size_t inbytes = size;
1893
+ size_t outbytes = sizeof(uint16_t) * size;
1894
+ // win-iconv includes WINICONV_CONST in its function signatures
1895
+ // https://github.com/simdutf/simdutf/pull/178
1896
+ #ifdef WINICONV_CONST
1897
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1898
+ #else
1899
+ char *inptr = data;
1900
+ #endif
1901
+ char *outptr = reinterpret_cast<char *>(output_buffer.get());
1902
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1903
+ if (result == static_cast<size_t>(-1)) {
1904
+ sink = 0;
1905
+ } else {
1906
+ sink = (sizeof(uint16_t) * size - outbytes) / sizeof(char);
1907
+ ;
1908
+ }
1909
+ };
1910
+ count_events(proc, iterations); // warming up!
1911
+ const auto result = count_events(proc, iterations);
1912
+ iconv_close(cv);
1913
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1914
+ std::cerr << "The output is zero which might indicate an error.\n";
1915
+ }
1916
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
1917
+ print_summary(result, size, char_count);
1918
+ }
1919
+
1920
+ void Benchmark::run_convert_utf16_to_latin1_iconv(size_t iterations) {
1921
+ iconv_t cv = iconv_open("ISO-8859-1", "UTF-16LE");
1922
+ if (cv == (iconv_t)(-1)) {
1923
+ fprintf(stderr,
1924
+ "[iconv] cannot initialize the UTF-16LE to ISO-8859-1 converter\n");
1925
+ return;
1926
+ }
1927
+ const simdutf::encoding_type bom =
1928
+ BOM::check_bom(input_data.data(), input_data.size());
1929
+ char16_t *data =
1930
+ reinterpret_cast<char16_t *>(input_data.data() + BOM::bom_byte_size(bom));
1931
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
1932
+ if (size % 2 != 0) {
1933
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
1934
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
1935
+ printf(" Running function on truncated input.\n");
1936
+ }
1937
+
1938
+ size /= 2;
1939
+
1940
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
1941
+ // yields 4 bytes,
1942
+ // thus we're making safe assumption that each 16-bit word will be
1943
+ // expanded to four bytes.
1944
+ std::unique_ptr<char[]> output_buffer{new char[size]};
1945
+
1946
+ volatile size_t sink{0};
1947
+
1948
+ auto proc = [cv, data, size, &output_buffer, &sink]() {
1949
+ size_t inbytes = sizeof(uint16_t) * size;
1950
+ size_t outbytes = size;
1951
+ #ifdef WINICONV_CONST
1952
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
1953
+ #else
1954
+ char *inptr = reinterpret_cast<char *>(data);
1955
+ #endif
1956
+ char *outptr = output_buffer.get();
1957
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
1958
+ if (result == static_cast<size_t>(-1)) {
1959
+ sink = 0;
1960
+ } else {
1961
+ sink = (size - outbytes) / sizeof(char16_t);
1962
+ }
1963
+ };
1964
+ count_events(proc, iterations); // warming up!
1965
+ const auto result = count_events(proc, iterations);
1966
+ iconv_close(cv);
1967
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
1968
+ std::cerr << "The output is zero which might indicate an error.\n";
1969
+ }
1970
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
1971
+ print_summary(result, input_data.size(), char_count);
1972
+ }
1973
+
1974
+ void Benchmark::run_convert_utf16_to_utf8_iconv(size_t iterations) {
1975
+ iconv_t cv = iconv_open("UTF-8", "UTF-16LE");
1976
+ if (cv == (iconv_t)(-1)) {
1977
+ fprintf(stderr,
1978
+ "[iconv] cannot initialize the UTF-16LE to UTF-8 converter\n");
1979
+ return;
1980
+ }
1981
+ const simdutf::encoding_type bom =
1982
+ BOM::check_bom(input_data.data(), input_data.size());
1983
+ char16_t *data =
1984
+ reinterpret_cast<char16_t *>(input_data.data() + BOM::bom_byte_size(bom));
1985
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
1986
+ if (size % 2 != 0) {
1987
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
1988
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
1989
+ printf(" Running function on truncated input.\n");
1990
+ }
1991
+
1992
+ size /= 2;
1993
+
1994
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
1995
+ // yields 4 bytes,
1996
+ // thus we're making safe assumption that each 16-bit word will be
1997
+ // expanded to four bytes.
1998
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
1999
+
2000
+ volatile size_t sink{0};
2001
+
2002
+ auto proc = [cv, data, size, &output_buffer, &sink]() {
2003
+ size_t inbytes = sizeof(uint16_t) * size;
2004
+ size_t outbytes = 4 * size;
2005
+ #ifdef WINICONV_CONST
2006
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
2007
+ #else
2008
+ char *inptr = reinterpret_cast<char *>(data);
2009
+ #endif
2010
+ char *outptr = output_buffer.get();
2011
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
2012
+ if (result == static_cast<size_t>(-1)) {
2013
+ sink = 0;
2014
+ } else {
2015
+ sink = (4 * size - outbytes) / sizeof(char16_t);
2016
+ }
2017
+ };
2018
+ count_events(proc, iterations); // warming up!
2019
+ const auto result = count_events(proc, iterations);
2020
+ iconv_close(cv);
2021
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2022
+ std::cerr << "The output is zero which might indicate an error.\n";
2023
+ }
2024
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2025
+ print_summary(result, input_data.size(), char_count);
2026
+ }
2027
+
2028
+ void Benchmark::run_convert_utf32_to_latin1_iconv(size_t iterations) {
2029
+ iconv_t cv = iconv_open("ISO-8859-1", "UTF-32LE");
2030
+ if (cv == (iconv_t)(-1)) {
2031
+ fprintf(stderr,
2032
+ "[iconv] cannot initialize the UTF-32 to ISO-8859-1 converter\n");
2033
+ return;
2034
+ }
2035
+ const simdutf::encoding_type bom =
2036
+ BOM::check_bom(input_data.data(), input_data.size());
2037
+ char32_t *data =
2038
+ reinterpret_cast<char32_t *>(input_data.data() + BOM::bom_byte_size(bom));
2039
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2040
+ if (size % 4 != 0) {
2041
+ printf(
2042
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
2043
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2044
+ printf(" Running function on truncated input.\n");
2045
+ }
2046
+
2047
+ size /= 4;
2048
+
2049
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2050
+ // yields 4 bytes,
2051
+ // thus we're making safe assumption that each 16-bit word will be
2052
+ // expanded to four bytes.
2053
+ std::unique_ptr<char[]> output_buffer{new char[size]};
2054
+
2055
+ volatile size_t sink{0};
2056
+
2057
+ auto proc = [cv, data, size, &output_buffer, &sink]() {
2058
+ size_t inbytes = sizeof(uint32_t) * size;
2059
+ size_t outbytes = size;
2060
+ #ifdef WINICONV_CONST
2061
+ WINICONV_CONST char *inptr = reinterpret_cast<WINICONV_CONST char *>(data);
2062
+ #else
2063
+ char *inptr = reinterpret_cast<char *>(data);
2064
+ #endif
2065
+ char *outptr = output_buffer.get();
2066
+ size_t result = iconv(cv, &inptr, &inbytes, &outptr, &outbytes);
2067
+ if (result == static_cast<size_t>(-1)) {
2068
+ sink = 0;
2069
+ abort();
2070
+ } else {
2071
+ sink = (size - outbytes) / sizeof(char32_t);
2072
+ }
2073
+ };
2074
+ count_events(proc, iterations); // warming up!
2075
+ const auto result = count_events(proc, iterations);
2076
+ iconv_close(cv);
2077
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2078
+ std::cerr << "The output is zero which might indicate an error.\n";
2079
+ }
2080
+ size_t char_count = size;
2081
+ print_summary(result, input_data.size(), char_count);
2082
+ }
2083
+ #endif
2084
+
2085
#ifdef INOUE2008
/**
 * Times inoue2008::convert_valid (UTF-8 to UTF-16) over the loaded input.
 *
 * Only compiled when INOUE2008 is defined.
 *
 * @param iterations number of repetitions handed to count_events.
 */
void Benchmark::run_convert_valid_utf8_to_utf16_inoue2008(size_t iterations) {
  // Inoue2008 is only up to 3-byte UTF8 sequence: look for the first byte
  // that would start a 4-byte sequence and warn (once) if there is one.
  bool four_byte_lead_seen = false;
  for (uint8_t byte : input_data) {
    if (byte >= 0b11110000) {
      four_byte_lead_seen = true;
      break;
    }
  }
  if (four_byte_lead_seen) {
    std::cerr << "Warning: Inoue 2008 does not support 4-byte inputs!"
              << std::endl;
  }
  // This is currently minimally tested. It is possible that the transcoding
  // could be wrong. It is also unsafe: it could fail in disastrous ways if the
  // input is adversarial.
  const size_t len = input_data.size();
  const char *src = reinterpret_cast<const char *>(input_data.data());
  std::unique_ptr<char16_t[]> dst{new char16_t[len]};
  volatile size_t sink = 0;
  auto transcode = [src, len, &dst, &sink]() {
    sink = inoue2008::convert_valid(src, len, dst.get());
  };
  count_events(transcode, iterations); // warming up!
  const auto events = count_events(transcode, iterations);
  if ((sink == 0) && (len != 0) && (iterations > 0)) {
    std::cerr
        << "The output is zero which might indicate a misconfiguration.\n";
  }
  const size_t char_count = get_active_implementation()->count_utf8(src, len);
  print_summary(events, len, char_count);
}
#endif
2115
+ /**
2116
+ * Bjoern Hoehrmann
2117
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
2118
+ */
2119
+ void Benchmark::run_convert_utf8_to_utf16_hoehrmann(size_t iterations) {
2120
+ uint8_t const *data = input_data.data();
2121
+ const size_t size = input_data.size();
2122
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
2123
+ volatile size_t sink{0};
2124
+ auto proc = [data, size, &output_buffer, &sink]() {
2125
+ sink = hoehrmann::toUtf16(data, size, output_buffer.get());
2126
+ };
2127
+ count_events(proc, iterations); // warming up!
2128
+ const auto result = count_events(proc, iterations);
2129
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2130
+ std::cerr << "The output is zero which might indicate an error.\n";
2131
+ }
2132
+ size_t char_count = get_active_implementation()->count_utf8(
2133
+ reinterpret_cast<const char *>(data), size);
2134
+ print_summary(result, size, char_count);
2135
+ }
2136
+ /**
2137
+ * Bjoern Hoehrmann
2138
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
2139
+ */
2140
+ void Benchmark::run_convert_utf8_to_utf32_hoehrmann(size_t iterations) {
2141
+ uint8_t const *data = input_data.data();
2142
+ const size_t size = input_data.size();
2143
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
2144
+ volatile size_t sink{0};
2145
+ auto proc = [data, size, &output_buffer, &sink]() {
2146
+ sink = hoehrmann::toUtf32(data, size, output_buffer.get());
2147
+ };
2148
+ count_events(proc, iterations); // warming up!
2149
+ const auto result = count_events(proc, iterations);
2150
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2151
+ std::cerr << "The output is zero which might indicate an error.\n";
2152
+ }
2153
+ size_t char_count = get_active_implementation()->count_utf8(
2154
+ reinterpret_cast<const char *>(data), size);
2155
+ print_summary(result, size, char_count);
2156
+ }
2157
+
2158
+ #ifdef __x86_64__
2159
+ /**
2160
+ * utf8lut: Vectorized UTF-8 converter.
2161
+ * by stgatilov (2019)
2162
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2163
+ */
2164
+ void Benchmark::run_convert_utf16_to_utf8_utf8lut(size_t iterations) {
2165
+ const simdutf::encoding_type bom =
2166
+ BOM::check_bom(input_data.data(), input_data.size());
2167
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2168
+ input_data.data() + BOM::bom_byte_size(bom));
2169
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2170
+ if (size % 2 != 0) {
2171
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2172
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2173
+ printf(" Running function on truncated input.\n");
2174
+ }
2175
+
2176
+ size /= 2;
2177
+
2178
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2179
+ // yields 4 bytes,
2180
+ // thus we're making safe assumption that each 16-bit word will be
2181
+ // expanded to four bytes.
2182
+ // utf8lut requires an extra 16 bytes of padding.
2183
+ std::unique_ptr<char[]> output_buffer{new char[size * 4 + 16]};
2184
+
2185
+ volatile size_t sink{0};
2186
+
2187
+ auto proc = [data, size, &output_buffer, &sink]() {
2188
+ std::unique_ptr<BaseBufferProcessor> processor(
2189
+ ProcessorSelector<dfUtf16, dfUtf8>::WithOptions<cmValidate>::Create());
2190
+ ConversionResult result = ConvertInMemory(
2191
+ *processor, reinterpret_cast<const char *>(data), 2 * size,
2192
+ reinterpret_cast<char *>(output_buffer.get()), size * 4 + 16);
2193
+ if (result.status != 0) {
2194
+ sink = 0;
2195
+ } else {
2196
+ sink = result.outputSize;
2197
+ }
2198
+ };
2199
+ count_events(proc, iterations); // warming up!
2200
+ const auto result = count_events(proc, iterations);
2201
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2202
+ std::cerr << "The output is zero which might indicate an error.\n";
2203
+ }
2204
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2205
+ print_summary(result, input_data.size(), char_count);
2206
+ }
2207
+ /**
2208
+ * utf8lut: Vectorized UTF-8 converter.
2209
+ * by stgatilov (2019)
2210
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2211
+ */
2212
+ void Benchmark::run_convert_valid_utf16_to_utf8_utf8lut(size_t iterations) {
2213
+ const simdutf::encoding_type bom =
2214
+ BOM::check_bom(input_data.data(), input_data.size());
2215
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2216
+ input_data.data() + BOM::bom_byte_size(bom));
2217
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2218
+ if (size % 2 != 0) {
2219
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2220
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2221
+ printf(" Running function on truncated input.\n");
2222
+ }
2223
+
2224
+ size /= 2;
2225
+
2226
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2227
+ // yields 4 bytes,
2228
+ // thus we're making safe assumption that each 16-bit word will be
2229
+ // expanded to four bytes.
2230
+ // utf8lut requires an extra 16 bytes of padding.
2231
+ std::unique_ptr<char[]> output_buffer{new char[size * 4 + 16]};
2232
+
2233
+ volatile size_t sink{0};
2234
+
2235
+ auto proc = [data, size, &output_buffer, &sink]() {
2236
+ std::unique_ptr<BaseBufferProcessor> processor(
2237
+ ProcessorSelector<dfUtf16, dfUtf8>::WithOptions<cmFull>::Create());
2238
+ ConversionResult result = ConvertInMemory(
2239
+ *processor, reinterpret_cast<const char *>(data), 2 * size,
2240
+ reinterpret_cast<char *>(output_buffer.get()), size * 4 + 16);
2241
+ if (result.status != 0) {
2242
+ sink = 0;
2243
+ } else {
2244
+ sink = result.outputSize;
2245
+ }
2246
+ };
2247
+ count_events(proc, iterations); // warming up!
2248
+ const auto result = count_events(proc, iterations);
2249
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2250
+ std::cerr << "The output is zero which might indicate an error.\n";
2251
+ }
2252
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2253
+ print_summary(result, input_data.size(), char_count);
2254
+ }
2255
+ /**
2256
+ * utf8lut: Vectorized UTF-8 converter.
2257
+ * by stgatilov (2019)
2258
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2259
+ */
2260
+ void Benchmark::run_convert_utf8_to_utf16_utf8lut(size_t iterations) {
2261
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2262
+ const size_t size = input_data.size();
2263
+ // utf8lut requires an extra 8 bytes of padding.
2264
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2 + 8]};
2265
+ volatile size_t sink{0};
2266
+ auto proc = [data, size, &output_buffer, &sink]() {
2267
+ std::unique_ptr<BaseBufferProcessor> processor(
2268
+ ProcessorSelector<dfUtf8, dfUtf16>::WithOptions<cmValidate>::Create());
2269
+ ConversionResult result = ConvertInMemory(
2270
+ *processor, data, size, reinterpret_cast<char *>(output_buffer.get()),
2271
+ size * 2 + 16);
2272
+ if (result.status != 0) {
2273
+ sink = 0;
2274
+ } else {
2275
+ sink = result.outputSize / 2;
2276
+ }
2277
+ };
2278
+ count_events(proc, iterations); // warming up!
2279
+ const auto result = count_events(proc, iterations);
2280
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2281
+ std::cerr
2282
+ << "The output is zero which might indicate a misconfiguration.\n";
2283
+ }
2284
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2285
+ print_summary(result, size, char_count);
2286
+ }
2287
+ /**
2288
+ * utf8lut: Vectorized UTF-8 converter.
2289
+ * by stgatilov (2019)
2290
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2291
+ */
2292
+ void Benchmark::run_convert_utf8_to_utf32_utf8lut(size_t iterations) {
2293
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2294
+ const size_t size = input_data.size();
2295
+
2296
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size + 4]};
2297
+ volatile size_t sink{0};
2298
+ auto proc = [data, size, &output_buffer, &sink]() {
2299
+ std::unique_ptr<BaseBufferProcessor> processor(
2300
+ ProcessorSelector<dfUtf8, dfUtf32>::WithOptions<cmValidate>::Create());
2301
+ ConversionResult result = ConvertInMemory(
2302
+ *processor, data, size, reinterpret_cast<char *>(output_buffer.get()),
2303
+ size * 4 + 16);
2304
+ if (result.status != 0) {
2305
+ sink = 0;
2306
+ } else {
2307
+ sink = result.outputSize / 2;
2308
+ }
2309
+ };
2310
+ count_events(proc, iterations); // warming up!
2311
+ const auto result = count_events(proc, iterations);
2312
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2313
+ std::cerr
2314
+ << "The output is zero which might indicate a misconfiguration.\n";
2315
+ }
2316
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2317
+ print_summary(result, size, char_count);
2318
+ }
2319
+ /**
2320
+ * utf8lut: Vectorized UTF-8 converter.
2321
+ * by stgatilov (2019)
2322
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2323
+ */
2324
+ void Benchmark::run_convert_valid_utf8_to_utf16_utf8lut(size_t iterations) {
2325
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2326
+ const size_t size = input_data.size();
2327
+ // utf8lut requires an extra 8 bytes of padding.
2328
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2 + 8]};
2329
+ volatile size_t sink{0};
2330
+ auto proc = [data, size, &output_buffer, &sink]() {
2331
+ std::unique_ptr<BaseBufferProcessor> processor(
2332
+ ProcessorSelector<dfUtf8, dfUtf16>::WithOptions<cmFull>::Create());
2333
+ ConversionResult result = ConvertInMemory(
2334
+ *processor, data, size, reinterpret_cast<char *>(output_buffer.get()),
2335
+ size * 2 + 16);
2336
+ if (result.status != 0) {
2337
+ sink = 0;
2338
+ } else {
2339
+ sink = result.outputSize / 2;
2340
+ }
2341
+ };
2342
+ count_events(proc, iterations); // warming up!
2343
+ const auto result = count_events(proc, iterations);
2344
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2345
+ std::cerr
2346
+ << "The output is zero which might indicate a misconfiguration.\n";
2347
+ }
2348
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2349
+ print_summary(result, size, char_count);
2350
+ }
2351
+
2352
+ /**
2353
+ * utf8lut: Vectorized UTF-8 converter.
2354
+ * by stgatilov (2019)
2355
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2356
+ */
2357
+ void Benchmark::run_convert_utf32_to_utf8_utf8lut(size_t iterations) {
2358
+ const simdutf::encoding_type bom =
2359
+ BOM::check_bom(input_data.data(), input_data.size());
2360
+ const char32_t *data = reinterpret_cast<const char32_t *>(
2361
+ input_data.data() + BOM::bom_byte_size(bom));
2362
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2363
+ if (size % 4 != 0) {
2364
+ printf(
2365
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
2366
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2367
+ printf(" Running function on truncated input.\n");
2368
+ }
2369
+
2370
+ size /= 4;
2371
+
2372
+ // Note: a single 32-bit word can yield up to four UTF-8 bytes. We are
2373
+ // making a safe assumption that each 32-bit word will yield four
2374
+ // UTF-8 bytes.
2375
+ // utf8lut requires an extra 16 bytes of padding.
2376
+ std::unique_ptr<char[]> output_buffer{new char[size * 4 + 16]};
2377
+
2378
+ volatile size_t sink{0};
2379
+
2380
+ auto proc = [data, size, &output_buffer, &sink]() {
2381
+ std::unique_ptr<BaseBufferProcessor> processor(
2382
+ ProcessorSelector<dfUtf32, dfUtf8>::WithOptions<cmValidate>::Create());
2383
+ ConversionResult result = ConvertInMemory(
2384
+ *processor, reinterpret_cast<const char *>(data), 4 * size,
2385
+ reinterpret_cast<char *>(output_buffer.get()), size * 4 + 16);
2386
+ if (result.status != 0) {
2387
+ sink = 0;
2388
+ } else {
2389
+ sink = result.outputSize;
2390
+ }
2391
+ };
2392
+ count_events(proc, iterations); // warming up!
2393
+ const auto result = count_events(proc, iterations);
2394
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2395
+ std::cerr << "The output is zero which might indicate an error.\n";
2396
+ }
2397
+ size_t char_count = size;
2398
+ print_summary(result, input_data.size(), char_count);
2399
+ }
2400
+
2401
+ /**
2402
+ * utf8lut: Vectorized UTF-8 converter.
2403
+ * by stgatilov (2019)
2404
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2405
+ */
2406
+ void Benchmark::run_convert_valid_utf8_to_utf32_utf8lut(size_t iterations) {
2407
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2408
+ const size_t size = input_data.size();
2409
+
2410
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size + 4]};
2411
+ volatile size_t sink{0};
2412
+ auto proc = [data, size, &output_buffer, &sink]() {
2413
+ std::unique_ptr<BaseBufferProcessor> processor(
2414
+ ProcessorSelector<dfUtf8, dfUtf32>::WithOptions<cmFull>::Create());
2415
+ ConversionResult result = ConvertInMemory(
2416
+ *processor, data, size, reinterpret_cast<char *>(output_buffer.get()),
2417
+ size * 4 + 16);
2418
+ if (result.status != 0) {
2419
+ sink = 0;
2420
+ } else {
2421
+ sink = result.outputSize / 2;
2422
+ }
2423
+ };
2424
+ count_events(proc, iterations); // warming up!
2425
+ const auto result = count_events(proc, iterations);
2426
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2427
+ std::cerr << "The output is zero which might indicate an error.\n";
2428
+ }
2429
+ size_t char_count = size;
2430
+ print_summary(result, input_data.size(), char_count);
2431
+ }
2432
+ /**
2433
+ * utf8lut: Vectorized UTF-8 converter.
2434
+ * by stgatilov (2019)
2435
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2436
+ */
2437
+ /**
2438
+ * utf8lut: Vectorized UTF-8 converter.
2439
+ * by stgatilov (2019)
2440
+ * https://dirtyhandscoding.github.io/posts/utf8lut-vectorized-utf-8-converter-introduction.html
2441
+ */
2442
+ void Benchmark::run_convert_valid_utf32_to_utf8_utf8lut(size_t iterations) {
2443
+ const simdutf::encoding_type bom =
2444
+ BOM::check_bom(input_data.data(), input_data.size());
2445
+ const char32_t *data = reinterpret_cast<const char32_t *>(
2446
+ input_data.data() + BOM::bom_byte_size(bom));
2447
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2448
+ if (size % 4 != 0) {
2449
+ printf(
2450
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
2451
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2452
+ printf(" Running function on truncated input.\n");
2453
+ }
2454
+
2455
+ size /= 4;
2456
+
2457
+ // Note: a single 32-bit word can yield up to four UTF-8 bytes. We are
2458
+ // making a safe assumption that each 32-bit word will yield four
2459
+ // UTF-8 bytes.
2460
+ // utf8lut requires an extra 16 bytes of padding.
2461
+ std::unique_ptr<char[]> output_buffer{new char[size * 4 + 16]};
2462
+
2463
+ volatile size_t sink{0};
2464
+
2465
+ auto proc = [data, size, &output_buffer, &sink]() {
2466
+ std::unique_ptr<BaseBufferProcessor> processor(
2467
+ ProcessorSelector<dfUtf32, dfUtf8>::WithOptions<cmFull>::Create());
2468
+ ConversionResult result = ConvertInMemory(
2469
+ *processor, reinterpret_cast<const char *>(data), 4 * size,
2470
+ reinterpret_cast<char *>(output_buffer.get()), size * 4 + 16);
2471
+ if (result.status != 0) {
2472
+ sink = 0;
2473
+ } else {
2474
+ sink = result.outputSize;
2475
+ }
2476
+ };
2477
+ count_events(proc, iterations); // warming up!
2478
+ const auto result = count_events(proc, iterations);
2479
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2480
+ std::cerr << "The output is zero which might indicate an error.\n";
2481
+ }
2482
+ size_t char_count = size;
2483
+ print_summary(result, input_data.size(), char_count);
2484
+ }
2485
+ /**
2486
+ * Bob Steagall, CppCon2018
2487
+ * https://github.com/BobSteagall/CppCon2018/
2488
+ *
2489
+ * Fast Conversion From UTF-8 with C++, DFAs, and SSE Intrinsics
2490
+ * https://www.youtube.com/watch?v=5FQ87-Ecb-A
2491
+ */
2492
+ void Benchmark::run_convert_utf8_to_utf16_cppcon2018(size_t iterations) {
2493
+ using char8_t = unsigned char;
2494
+ const char8_t *data = reinterpret_cast<const char8_t *>(input_data.data());
2495
+ const size_t size = input_data.size();
2496
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
2497
+ volatile size_t sink{0};
2498
+ auto proc = [data, size, &output_buffer, &sink]() {
2499
+ sink = uu::UtfUtils::SseConvert(data, data + size, output_buffer.get());
2500
+ };
2501
+ count_events(proc, iterations); // warming up!
2502
+ const auto result = count_events(proc, iterations);
2503
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2504
+ std::cerr
2505
+ << "The output is zero which might indicate a misconfiguration.\n";
2506
+ }
2507
+ size_t char_count = get_active_implementation()->count_utf8(
2508
+ reinterpret_cast<const char *>(data), size);
2509
+ print_summary(result, size, char_count);
2510
+ }
2511
+ /**
2512
+ * Bob Steagall, CppCon2018
2513
+ * https://github.com/BobSteagall/CppCon2018/
2514
+ *
2515
+ * Fast Conversion From UTF-8 with C++, DFAs, and SSE Intrinsics
2516
+ * https://www.youtube.com/watch?v=5FQ87-Ecb-A
2517
+ */
2518
+ void Benchmark::run_convert_utf8_to_utf32_cppcon2018(size_t iterations) {
2519
+ using char8_t = unsigned char;
2520
+ const char8_t *data = reinterpret_cast<const char8_t *>(input_data.data());
2521
+ const size_t size = input_data.size();
2522
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
2523
+ volatile size_t sink{0};
2524
+ auto proc = [data, size, &output_buffer, &sink]() {
2525
+ sink = uu::UtfUtils::SseConvert(data, data + size, output_buffer.get());
2526
+ };
2527
+ count_events(proc, iterations); // warming up!
2528
+ const auto result = count_events(proc, iterations);
2529
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2530
+ std::cerr
2531
+ << "The output is zero which might indicate a misconfiguration.\n";
2532
+ }
2533
+ size_t char_count = get_active_implementation()->count_utf8(
2534
+ reinterpret_cast<const char *>(data), size);
2535
+ print_summary(result, size, char_count);
2536
+ }
2537
+ /**
2538
+ * Cameron, Robert D, A case study in SIMD text processing with parallel bit
2539
+ * streams: UTF-8 to UTF-16 transcoding, Proceedings of the 13th ACM SIGPLAN
2540
+ * Symposium on Principles and practice of parallel programming, 91--98.
2541
+ */
2542
+ void Benchmark::run_convert_utf8_to_utf16_u8u16(size_t iterations) {
2543
+ // u8u16 wants to take mutable chars, let us hope it does not actually mutate
2544
+ // anything!
2545
+ //
2546
+ // This is currently untested. At a glance it looks fine, but
2547
+ // it is possible that the transcoding could be wrong.
2548
+ char *data = reinterpret_cast<char *>(input_data.data());
2549
+ const size_t size = input_data.size();
2550
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
2551
+ volatile size_t sink{0};
2552
+ auto proc = [data, size, &output_buffer, &sink]() {
2553
+ char *srcbuf_ptr = data;
2554
+ size_t inbytes_left = size;
2555
+ char *trgtbuf_ptr = reinterpret_cast<char *>(output_buffer.get());
2556
+ size_t outbytes_left = size * sizeof(char16_t);
2557
+ size_t result_code =
2558
+ u8u16(&srcbuf_ptr, &inbytes_left, &trgtbuf_ptr, &outbytes_left);
2559
+ bool is_ok = (result_code != size_t(-1));
2560
+ if (is_ok) {
2561
+ sink = (reinterpret_cast<char16_t *>(trgtbuf_ptr) - output_buffer.get());
2562
+ } else {
2563
+ sink = 0;
2564
+ }
2565
+ };
2566
+ count_events(proc, iterations); // warming up!
2567
+ const auto result = count_events(proc, iterations);
2568
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2569
+ std::cerr
2570
+ << "The output is zero which might indicate a misconfiguration.\n";
2571
+ }
2572
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2573
+ print_summary(result, size, char_count);
2574
+ }
2575
+
2576
+ /**
2577
+ * Olivier Goffart, UTF-8 processing using SIMD (SSE4), 2012.
2578
+ * https://woboq.com/blog/utf-8-processing-using-simd.html
2579
+ */
2580
+ void Benchmark::run_convert_utf8_to_utf16_utf8sse4(size_t iterations) {
2581
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2582
+ const size_t size = input_data.size();
2583
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
2584
+ volatile size_t sink{0};
2585
+ auto proc = [data, size, &output_buffer, &sink]() {
2586
+ const char *srcbuf_ptr = data;
2587
+ size_t inbytes_left = size;
2588
+ char *trgtbuf_ptr = reinterpret_cast<char *>(output_buffer.get());
2589
+ size_t outbytes_left = size * sizeof(char16_t);
2590
+ size_t result_code = utf8sse4::fromUtf8(&srcbuf_ptr, &inbytes_left,
2591
+ &trgtbuf_ptr, &outbytes_left);
2592
+ bool is_ok = (result_code != size_t(-1));
2593
+ if (is_ok) {
2594
+ sink = (reinterpret_cast<char16_t *>(trgtbuf_ptr) - output_buffer.get());
2595
+ } else {
2596
+ sink = 0;
2597
+ }
2598
+ };
2599
+ count_events(proc, iterations); // warming up!
2600
+ const auto result = count_events(proc, iterations);
2601
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2602
+ std::cerr
2603
+ << "The output is zero which might indicate a misconfiguration.\n";
2604
+ }
2605
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2606
+ print_summary(result, size, char_count);
2607
+ }
2608
+ #endif
2609
+
2610
+ void Benchmark::run_convert_valid_utf8_to_utf16le(
2611
+ const simdutf::implementation &implementation, size_t iterations) {
2612
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2613
+ const size_t size = input_data.size();
2614
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
2615
+ volatile size_t sink{0};
2616
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2617
+ sink = implementation.convert_valid_utf8_to_utf16le(data, size,
2618
+ output_buffer.get());
2619
+ };
2620
+ count_events(proc, iterations); // warming up!
2621
+ const auto result = count_events(proc, iterations);
2622
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2623
+ std::cerr
2624
+ << "The output is zero which might indicate a misconfiguration.\n";
2625
+ }
2626
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2627
+ print_summary(result, size, char_count);
2628
+ }
2629
+
2630
+ void Benchmark::run_convert_valid_utf8_to_utf32(
2631
+ const simdutf::implementation &implementation, size_t iterations) {
2632
+ const char *data = reinterpret_cast<const char *>(input_data.data());
2633
+ const size_t size = input_data.size();
2634
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
2635
+ volatile size_t sink{0};
2636
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2637
+ sink = implementation.convert_valid_utf8_to_utf32(data, size,
2638
+ output_buffer.get());
2639
+ };
2640
+ count_events(proc, iterations); // warming up!
2641
+ const auto result = count_events(proc, iterations);
2642
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2643
+ std::cerr
2644
+ << "The output is zero which might indicate a misconfiguration.\n";
2645
+ }
2646
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
2647
+ print_summary(result, size, char_count);
2648
+ }
2649
+
2650
+ void Benchmark::run_convert_utf16le_to_latin1(
2651
+ const simdutf::implementation &implementation, size_t iterations) {
2652
+ const simdutf::encoding_type bom =
2653
+ BOM::check_bom(input_data.data(), input_data.size());
2654
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2655
+ input_data.data() + BOM::bom_byte_size(bom));
2656
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2657
+ if (size % 2 != 0) {
2658
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2659
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2660
+ printf(" Running function on truncated input.\n");
2661
+ }
2662
+
2663
+ size /= 2;
2664
+ std::unique_ptr<char[]> output_buffer{new char[size]};
2665
+ volatile size_t sink{0};
2666
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2667
+ sink = implementation.convert_utf16le_to_latin1(data, size,
2668
+ output_buffer.get());
2669
+ };
2670
+ count_events(proc, iterations); // warming up!
2671
+ const auto result = count_events(proc, iterations);
2672
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2673
+ std::cerr << "The output is zero which might indicate an error.\n";
2674
+ }
2675
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2676
+ print_summary(result, input_data.size(), char_count);
2677
+ }
2678
+
2679
+ void Benchmark::run_convert_utf16le_to_latin1_with_errors(
2680
+ const simdutf::implementation &implementation, size_t iterations) {
2681
+ const simdutf::encoding_type bom =
2682
+ BOM::check_bom(input_data.data(), input_data.size());
2683
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2684
+ input_data.data() + BOM::bom_byte_size(bom));
2685
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2686
+ if (size % 2 != 0) {
2687
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2688
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2689
+ printf(" Running function on truncated input.\n");
2690
+ }
2691
+
2692
+ size /= 2;
2693
+ std::unique_ptr<char[]> output_buffer{new char[size]};
2694
+ volatile bool sink{false};
2695
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2696
+ result res = implementation.convert_utf16le_to_latin1_with_errors(
2697
+ data, size, output_buffer.get());
2698
+ sink = !(res.error);
2699
+ };
2700
+ count_events(proc, iterations); // warming up!
2701
+ const auto result = count_events(proc, iterations);
2702
+ if ((sink == false) && (iterations > 0)) {
2703
+ std::cerr << "The input was declared invalid.\n";
2704
+ }
2705
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2706
+ print_summary(result, input_data.size(), char_count);
2707
+ }
2708
+
2709
+ void Benchmark::run_convert_valid_utf16le_to_latin1(
2710
+ const simdutf::implementation &implementation, size_t iterations) {
2711
+ const simdutf::encoding_type bom =
2712
+ BOM::check_bom(input_data.data(), input_data.size());
2713
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2714
+ input_data.data() + BOM::bom_byte_size(bom));
2715
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2716
+ if (size % 2 != 0) {
2717
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2718
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2719
+ printf(" Running function on truncated input.\n");
2720
+ }
2721
+
2722
+ size /= 2;
2723
+ std::unique_ptr<char[]> output_buffer{new char[size]};
2724
+ volatile size_t sink{0};
2725
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2726
+ sink = implementation.convert_valid_utf16le_to_latin1(data, size,
2727
+ output_buffer.get());
2728
+ };
2729
+ count_events(proc, iterations); // warming up!
2730
+ const auto result = count_events(proc, iterations);
2731
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2732
+ std::cerr << "The output is zero which might indicate an error.\n";
2733
+ }
2734
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2735
+ print_summary(result, input_data.size(), char_count);
2736
+ }
2737
+
2738
+ void Benchmark::run_convert_utf16_to_utf8_safe(
2739
+ const simdutf::implementation &implementation, size_t iterations) {
2740
+ const simdutf::implementation *active_implementation =
2741
+ simdutf::get_active_implementation();
2742
+ simdutf::get_active_implementation() =
2743
+ &implementation; // set the active implementation
2744
+ const simdutf::encoding_type bom =
2745
+ BOM::check_bom(input_data.data(), input_data.size());
2746
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2747
+ input_data.data() + BOM::bom_byte_size(bom));
2748
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2749
+ if (size % 2 != 0) {
2750
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2751
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2752
+ printf(" Running function on truncated input.\n");
2753
+ }
2754
+
2755
+ size /= 2;
2756
+
2757
+ size_t budget = simdutf::utf8_length_from_utf16(data, size);
2758
+
2759
+ std::unique_ptr<char[]> output_buffer{new char[budget]};
2760
+
2761
+ volatile size_t sink{0};
2762
+
2763
+ auto proc = [&implementation, data, size, &output_buffer, &sink, &budget]() {
2764
+ sink = simdutf::convert_utf16_to_utf8_safe(data, size, output_buffer.get(),
2765
+ budget);
2766
+ };
2767
+ count_events(proc, iterations); // warming up!
2768
+ const auto result = count_events(proc, iterations);
2769
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2770
+ std::cerr << "The output is zero which might indicate an error.\n";
2771
+ }
2772
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2773
+ print_summary(result, input_data.size(), char_count);
2774
+ simdutf::get_active_implementation() =
2775
+ active_implementation; // restore the active implementation
2776
+ }
2777
+
2778
+ void Benchmark::run_convert_utf16le_to_utf8(
2779
+ const simdutf::implementation &implementation, size_t iterations) {
2780
+ const simdutf::encoding_type bom =
2781
+ BOM::check_bom(input_data.data(), input_data.size());
2782
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2783
+ input_data.data() + BOM::bom_byte_size(bom));
2784
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2785
+ if (size % 2 != 0) {
2786
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2787
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2788
+ printf(" Running function on truncated input.\n");
2789
+ }
2790
+
2791
+ size /= 2;
2792
+
2793
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2794
+ // yields 4 bytes,
2795
+ // thus we're making safe assumption that each 16-bit word will be
2796
+ // expanded to four bytes.
2797
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
2798
+
2799
+ volatile size_t sink{0};
2800
+
2801
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2802
+ sink =
2803
+ implementation.convert_utf16le_to_utf8(data, size, output_buffer.get());
2804
+ };
2805
+ count_events(proc, iterations); // warming up!
2806
+ const auto result = count_events(proc, iterations);
2807
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2808
+ std::cerr << "The output is zero which might indicate an error.\n";
2809
+ }
2810
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2811
+ print_summary(result, input_data.size(), char_count);
2812
+ }
2813
+
2814
+ void Benchmark::run_convert_utf16le_to_utf8_with_errors(
2815
+ const simdutf::implementation &implementation, size_t iterations) {
2816
+ const simdutf::encoding_type bom =
2817
+ BOM::check_bom(input_data.data(), input_data.size());
2818
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2819
+ input_data.data() + BOM::bom_byte_size(bom));
2820
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2821
+ if (size % 2 != 0) {
2822
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2823
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2824
+ printf(" Running function on truncated input.\n");
2825
+ }
2826
+
2827
+ size /= 2;
2828
+
2829
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2830
+ // yields 4 bytes,
2831
+ // thus we're making safe assumption that each 16-bit word will be
2832
+ // expanded to four bytes.
2833
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
2834
+
2835
+ volatile bool sink{false};
2836
+
2837
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2838
+ result res = implementation.convert_utf16le_to_utf8_with_errors(
2839
+ data, size, output_buffer.get());
2840
+ sink = !(res.error);
2841
+ };
2842
+ count_events(proc, iterations); // warming up!
2843
+ const auto result = count_events(proc, iterations);
2844
+ if ((sink == false) && (iterations > 0)) {
2845
+ std::cerr << "The input was declared invalid.\n";
2846
+ }
2847
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2848
+ print_summary(result, input_data.size(), char_count);
2849
+ }
2850
+
2851
+ void Benchmark::run_convert_utf16le_to_utf32(
2852
+ const simdutf::implementation &implementation, size_t iterations) {
2853
+ const simdutf::encoding_type bom =
2854
+ BOM::check_bom(input_data.data(), input_data.size());
2855
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2856
+ input_data.data() + BOM::bom_byte_size(bom));
2857
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2858
+ if (size % 2 != 0) {
2859
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2860
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2861
+ printf(" Running function on truncated input.\n");
2862
+ }
2863
+
2864
+ size /= 2;
2865
+
2866
+ // Note: all code units yield 4 bytes. We are making a safe assumption that
2867
+ // all code units will be non-surrogate code units so the size would get
2868
+ // doubled (16 bits -> 32 bits).
2869
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size * 2]};
2870
+
2871
+ volatile size_t sink{0};
2872
+
2873
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2874
+ sink = implementation.convert_utf16le_to_utf32(data, size,
2875
+ output_buffer.get());
2876
+ };
2877
+ count_events(proc, iterations); // warming up!
2878
+ const auto result = count_events(proc, iterations);
2879
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2880
+ std::cerr << "The output is zero which might indicate an error.\n";
2881
+ }
2882
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2883
+ print_summary(result, input_data.size(), char_count);
2884
+ }
2885
+
2886
+ void Benchmark::run_convert_utf16le_to_utf32_with_errors(
2887
+ const simdutf::implementation &implementation, size_t iterations) {
2888
+ const simdutf::encoding_type bom =
2889
+ BOM::check_bom(input_data.data(), input_data.size());
2890
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2891
+ input_data.data() + BOM::bom_byte_size(bom));
2892
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2893
+ if (size % 2 != 0) {
2894
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2895
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2896
+ printf(" Running function on truncated input.\n");
2897
+ }
2898
+
2899
+ size /= 2;
2900
+
2901
+ // Note: all code units yield 4 bytes. We are making a safe assumption that
2902
+ // all code units will be non-surrogate code units so the size would get
2903
+ // doubled (16 bits -> 32 bits).
2904
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size * 2]};
2905
+
2906
+ volatile bool sink{false};
2907
+
2908
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
2909
+ result res = implementation.convert_utf16le_to_utf32_with_errors(
2910
+ data, size, output_buffer.get());
2911
+ sink = !(res.error);
2912
+ };
2913
+ count_events(proc, iterations); // warming up!
2914
+ const auto result = count_events(proc, iterations);
2915
+ if ((sink == false) && (iterations > 0)) {
2916
+ std::cerr << "The input was declared invalid.\n";
2917
+ }
2918
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2919
+ print_summary(result, input_data.size(), char_count);
2920
+ }
2921
+
2922
+ void Benchmark::run_convert_utf16le_to_utf8_with_dynamic_allocation(
2923
+ const simdutf::implementation &implementation, size_t iterations) {
2924
+ const simdutf::encoding_type bom =
2925
+ BOM::check_bom(input_data.data(), input_data.size());
2926
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2927
+ input_data.data() + BOM::bom_byte_size(bom));
2928
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2929
+ if (size % 2 != 0) {
2930
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2931
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2932
+ printf(" Running function on truncated input.\n");
2933
+ }
2934
+
2935
+ size /= 2;
2936
+
2937
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
2938
+ // yields 4 bytes,
2939
+ // thus we're making safe assumption that each 16-bit word will be
2940
+ // expanded to four bytes.
2941
+
2942
+ volatile size_t sink{0};
2943
+
2944
+ auto proc = [&implementation, data, size, &sink]() {
2945
+ auto dyn_size = implementation.utf8_length_from_utf16le(data, size);
2946
+ std::unique_ptr<char[]> output_buffer{new char[dyn_size]};
2947
+ sink =
2948
+ implementation.convert_utf16le_to_utf8(data, size, output_buffer.get());
2949
+ };
2950
+ count_events(proc, iterations); // warming up!
2951
+ const auto result = count_events(proc, iterations);
2952
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2953
+ std::cerr << "The output is zero which might indicate an error.\n";
2954
+ }
2955
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2956
+ print_summary(result, input_data.size(), char_count);
2957
+ }
2958
+
2959
+ void Benchmark::run_convert_utf16le_to_utf32_with_dynamic_allocation(
2960
+ const simdutf::implementation &implementation, size_t iterations) {
2961
+ const simdutf::encoding_type bom =
2962
+ BOM::check_bom(input_data.data(), input_data.size());
2963
+ const char16_t *data = reinterpret_cast<const char16_t *>(
2964
+ input_data.data() + BOM::bom_byte_size(bom));
2965
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
2966
+ if (size % 2 != 0) {
2967
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
2968
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
2969
+ printf(" Running function on truncated input.\n");
2970
+ }
2971
+
2972
+ size /= 2;
2973
+
2974
+ // Note: all code units yield 4 bytes. We are making a safe assumption that
2975
+ // all code units will be non-surrogate code units so the size would get
2976
+ // doubled (16 bits -> 32 bits).
2977
+
2978
+ volatile size_t sink{0};
2979
+
2980
+ auto proc = [&implementation, data, size, &sink]() {
2981
+ auto dyn_size = implementation.utf32_length_from_utf16le(data, size);
2982
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[dyn_size]};
2983
+ sink = implementation.convert_utf16le_to_utf32(data, size,
2984
+ output_buffer.get());
2985
+ };
2986
+ count_events(proc, iterations); // warming up!
2987
+ const auto result = count_events(proc, iterations);
2988
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
2989
+ std::cerr << "The output is zero which might indicate an error.\n";
2990
+ }
2991
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
2992
+ print_summary(result, input_data.size(), char_count);
2993
+ }
2994
+
2995
+ void Benchmark::run_convert_valid_utf16le_to_utf8(
2996
+ const simdutf::implementation &implementation, size_t iterations) {
2997
+ const simdutf::encoding_type bom =
2998
+ BOM::check_bom(input_data.data(), input_data.size());
2999
+ const char16_t *data = reinterpret_cast<const char16_t *>(
3000
+ input_data.data() + BOM::bom_byte_size(bom));
3001
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3002
+ if (size % 2 != 0) {
3003
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3004
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3005
+ printf(" Running function on truncated input.\n");
3006
+ }
3007
+
3008
+ size /= 2;
3009
+
3010
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
3011
+ // yields 4 bytes,
3012
+ // thus we're making safe assumption that each 16-bit word will be
3013
+ // expanded to four bytes.
3014
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3015
+
3016
+ volatile size_t sink{0};
3017
+
3018
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3019
+ sink = implementation.convert_valid_utf16le_to_utf8(data, size,
3020
+ output_buffer.get());
3021
+ };
3022
+ count_events(proc, iterations); // warming up!
3023
+ const auto result = count_events(proc, iterations);
3024
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3025
+ std::cerr << "The output is zero which might indicate an error.\n";
3026
+ }
3027
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3028
+ print_summary(result, input_data.size(), char_count);
3029
+ }
3030
+
3031
+ void Benchmark::run_convert_utf32_to_latin1(
3032
+ const simdutf::implementation &implementation, size_t iterations) {
3033
+ const simdutf::encoding_type bom =
3034
+ BOM::check_bom(input_data.data(), input_data.size());
3035
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3036
+ input_data.data() + BOM::bom_byte_size(bom));
3037
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3038
+ if (size % 4 != 0) {
3039
+ printf(
3040
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3041
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3042
+ printf(" Running function on truncated input.\n");
3043
+ }
3044
+
3045
+ size /= 4;
3046
+
3047
+ std::unique_ptr<char[]> output_buffer{new char[size]};
3048
+ volatile size_t sink{0};
3049
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3050
+ sink =
3051
+ implementation.convert_utf32_to_latin1(data, size, output_buffer.get());
3052
+ };
3053
+ count_events(proc, iterations); // warming up!
3054
+ const auto result = count_events(proc, iterations);
3055
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3056
+ std::cerr << "The output is zero which might indicate an error.\n";
3057
+ }
3058
+ size_t char_count = size;
3059
+ print_summary(result, input_data.size(), char_count);
3060
+ }
3061
+ void Benchmark::run_convert_utf32_to_latin1_with_errors(
3062
+ const simdutf::implementation &implementation, size_t iterations) {
3063
+ const simdutf::encoding_type bom =
3064
+ BOM::check_bom(input_data.data(), input_data.size());
3065
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3066
+ input_data.data() + BOM::bom_byte_size(bom));
3067
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3068
+ if (size % 4 != 0) {
3069
+ printf(
3070
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3071
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3072
+ printf(" Running function on truncated input.\n");
3073
+ }
3074
+
3075
+ size /= 4;
3076
+
3077
+ std::unique_ptr<char[]> output_buffer{new char[size]};
3078
+ volatile bool sink{false};
3079
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3080
+ result res = implementation.convert_utf32_to_latin1_with_errors(
3081
+ data, size, output_buffer.get());
3082
+ sink = !(res.error);
3083
+ };
3084
+ count_events(proc, iterations); // warming up!
3085
+ const auto result = count_events(proc, iterations);
3086
+ if ((sink == false) && (iterations > 0)) {
3087
+ std::cerr << "The input was declared invalid.\n";
3088
+ }
3089
+ size_t char_count = size;
3090
+ print_summary(result, input_data.size(), char_count);
3091
+ }
3092
+ void Benchmark::run_convert_valid_utf32_to_latin1(
3093
+ const simdutf::implementation &implementation, size_t iterations) {
3094
+ const simdutf::encoding_type bom =
3095
+ BOM::check_bom(input_data.data(), input_data.size());
3096
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3097
+ input_data.data() + BOM::bom_byte_size(bom));
3098
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3099
+ if (size % 4 != 0) {
3100
+ printf(
3101
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3102
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3103
+ printf(" Running function on truncated input.\n");
3104
+ }
3105
+
3106
+ size /= 4;
3107
+
3108
+ std::unique_ptr<char[]> output_buffer{new char[size]};
3109
+ volatile size_t sink{0};
3110
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3111
+ sink = implementation.convert_valid_utf32_to_latin1(data, size,
3112
+ output_buffer.get());
3113
+ };
3114
+ count_events(proc, iterations); // warming up!
3115
+ const auto result = count_events(proc, iterations);
3116
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3117
+ std::cerr << "The output is zero which might indicate an error.\n";
3118
+ }
3119
+ size_t char_count = size;
3120
+ print_summary(result, input_data.size(), char_count);
3121
+ }
3122
+
3123
+ void Benchmark::run_convert_utf32_to_utf8(
3124
+ const simdutf::implementation &implementation, size_t iterations) {
3125
+ const simdutf::encoding_type bom =
3126
+ BOM::check_bom(input_data.data(), input_data.size());
3127
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3128
+ input_data.data() + BOM::bom_byte_size(bom));
3129
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3130
+ if (size % 4 != 0) {
3131
+ printf(
3132
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3133
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3134
+ printf(" Running function on truncated input.\n");
3135
+ }
3136
+
3137
+ size /= 4;
3138
+
3139
+ // Note: In the "worst" case, a 32-bit word will yield 4 UTF-8 bytes. So, we
3140
+ // are making a safe assumption that each word will produce 4 bytes.
3141
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3142
+
3143
+ volatile size_t sink{0};
3144
+
3145
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3146
+ sink =
3147
+ implementation.convert_utf32_to_utf8(data, size, output_buffer.get());
3148
+ };
3149
+ count_events(proc, iterations); // warming up!
3150
+ const auto result = count_events(proc, iterations);
3151
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3152
+ std::cerr << "The output is zero which might indicate an error.\n";
3153
+ }
3154
+ size_t char_count = size;
3155
+ print_summary(result, input_data.size(), char_count);
3156
+ }
3157
+
3158
+ void Benchmark::run_convert_utf32_to_utf8_with_errors(
3159
+ const simdutf::implementation &implementation, size_t iterations) {
3160
+ const simdutf::encoding_type bom =
3161
+ BOM::check_bom(input_data.data(), input_data.size());
3162
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3163
+ input_data.data() + BOM::bom_byte_size(bom));
3164
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3165
+ if (size % 4 != 0) {
3166
+ printf(
3167
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3168
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3169
+ printf(" Running function on truncated input.\n");
3170
+ }
3171
+
3172
+ size /= 4;
3173
+
3174
+ // Note: In the "worst" case, a 32-bit word will yield 4 UTF-8 bytes. So, we
3175
+ // are making a safe assumption that each word will produce 4 bytes.
3176
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3177
+
3178
+ volatile bool sink{false};
3179
+
3180
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3181
+ result res = implementation.convert_utf32_to_utf8_with_errors(
3182
+ data, size, output_buffer.get());
3183
+ sink = !(res.error);
3184
+ };
3185
+ count_events(proc, iterations); // warming up!
3186
+ const auto result = count_events(proc, iterations);
3187
+ if ((sink == false) && (iterations > 0)) {
3188
+ std::cerr << "The input was declared invalid.\n";
3189
+ }
3190
+ size_t char_count = size;
3191
+ print_summary(result, input_data.size(), char_count);
3192
+ }
3193
+
3194
+ void Benchmark::run_convert_valid_utf32_to_utf8(
3195
+ const simdutf::implementation &implementation, size_t iterations) {
3196
+ const simdutf::encoding_type bom =
3197
+ BOM::check_bom(input_data.data(), input_data.size());
3198
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3199
+ input_data.data() + BOM::bom_byte_size(bom));
3200
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3201
+ if (size % 4 != 0) {
3202
+ printf(
3203
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3204
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3205
+ printf(" Running function on truncated input.\n");
3206
+ }
3207
+
3208
+ size /= 4;
3209
+
3210
+ // Note: In the "worst" case, a 32-bit word will yield 4 UTF-8 bytes. So, we
3211
+ // are making a safe assumption that each word will produce 4 bytes.
3212
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3213
+
3214
+ volatile size_t sink{0};
3215
+
3216
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3217
+ sink = implementation.convert_valid_utf32_to_utf8(data, size,
3218
+ output_buffer.get());
3219
+ };
3220
+ count_events(proc, iterations); // warming up!
3221
+ const auto result = count_events(proc, iterations);
3222
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3223
+ std::cerr << "The output is zero which might indicate an error.\n";
3224
+ }
3225
+ size_t char_count = size;
3226
+ print_summary(result, input_data.size(), char_count);
3227
+ }
3228
+
3229
+ void Benchmark::run_convert_valid_utf16le_to_utf32(
3230
+ const simdutf::implementation &implementation, size_t iterations) {
3231
+ const simdutf::encoding_type bom =
3232
+ BOM::check_bom(input_data.data(), input_data.size());
3233
+ const char16_t *data = reinterpret_cast<const char16_t *>(
3234
+ input_data.data() + BOM::bom_byte_size(bom));
3235
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3236
+ if (size % 2 != 0) {
3237
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3238
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3239
+ printf(" Running function on truncated input.\n");
3240
+ }
3241
+
3242
+ size /= 2;
3243
+
3244
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
3245
+ // yields 4 bytes,
3246
+ // thus we're making safe assumption that each 16-bit word will be
3247
+ // expanded to four bytes.
3248
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size * 4]};
3249
+
3250
+ volatile size_t sink{0};
3251
+
3252
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3253
+ sink = implementation.convert_valid_utf16le_to_utf32(data, size,
3254
+ output_buffer.get());
3255
+ };
3256
+ count_events(proc, iterations); // warming up!
3257
+ const auto result = count_events(proc, iterations);
3258
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3259
+ std::cerr << "The output is zero which might indicate an error.\n";
3260
+ }
3261
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3262
+ print_summary(result, input_data.size(), char_count);
3263
+ }
3264
+
3265
+ template <endianness byte_order>
3266
+ void Benchmark::run_convert_utf32_to_utf16(
3267
+ const simdutf::implementation &implementation, size_t iterations) {
3268
+ const simdutf::encoding_type bom =
3269
+ BOM::check_bom(input_data.data(), input_data.size());
3270
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3271
+ input_data.data() + BOM::bom_byte_size(bom));
3272
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3273
+ if (size % 4 != 0) {
3274
+ printf(
3275
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3276
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3277
+ printf(" Running function on truncated input.\n");
3278
+ }
3279
+
3280
+ size /= 4;
3281
+
3282
+ // Note: In the "worst" case, a 32-bit word will yield two 16-bit code units.
3283
+ // So, we are making a safe assumption that each word will produce 2 bytes.
3284
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2]};
3285
+
3286
+ volatile size_t sink{0};
3287
+
3288
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3289
+ if (byte_order == endianness::LITTLE) {
3290
+ sink = implementation.convert_utf32_to_utf16le(data, size,
3291
+ output_buffer.get());
3292
+ } else {
3293
+ sink = implementation.convert_utf32_to_utf16be(data, size,
3294
+ output_buffer.get());
3295
+ }
3296
+ };
3297
+ count_events(proc, iterations); // warming up!
3298
+ const auto result = count_events(proc, iterations);
3299
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3300
+ std::cerr << "The output is zero which might indicate an error.\n";
3301
+ }
3302
+ size_t char_count = size;
3303
+ print_summary(result, input_data.size(), char_count);
3304
+ }
3305
+
3306
+ template <endianness byte_order>
3307
+ void Benchmark::run_convert_utf32_to_utf16_with_errors(
3308
+ const simdutf::implementation &implementation, size_t iterations) {
3309
+ const simdutf::encoding_type bom =
3310
+ BOM::check_bom(input_data.data(), input_data.size());
3311
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3312
+ input_data.data() + BOM::bom_byte_size(bom));
3313
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3314
+ if (size % 4 != 0) {
3315
+ printf(
3316
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3317
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3318
+ printf(" Running function on truncated input.\n");
3319
+ }
3320
+
3321
+ size /= 4;
3322
+
3323
+ // Note: In the "worst" case, a 32-bit word will yield two 16-bit code units.
3324
+ // So, we are making a safe assumption that each word will produce 2 bytes.
3325
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2]};
3326
+
3327
+ volatile bool sink{false};
3328
+
3329
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3330
+ if (byte_order == endianness::LITTLE) {
3331
+ result res = implementation.convert_utf32_to_utf16le_with_errors(
3332
+ data, size, output_buffer.get());
3333
+ sink = !(res.error);
3334
+ } else {
3335
+ result res = implementation.convert_utf32_to_utf16be_with_errors(
3336
+ data, size, output_buffer.get());
3337
+ sink = !(res.error);
3338
+ }
3339
+ };
3340
+ count_events(proc, iterations); // warming up!
3341
+ const auto result = count_events(proc, iterations);
3342
+ if ((sink == false) && (iterations > 0)) {
3343
+ std::cerr << "The input was declared invalid.\n";
3344
+ }
3345
+ size_t char_count = size;
3346
+ print_summary(result, input_data.size(), char_count);
3347
+ }
3348
+
3349
+ template <endianness byte_order>
3350
+ void Benchmark::run_convert_valid_utf32_to_utf16(
3351
+ const simdutf::implementation &implementation, size_t iterations) {
3352
+ const simdutf::encoding_type bom =
3353
+ BOM::check_bom(input_data.data(), input_data.size());
3354
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3355
+ input_data.data() + BOM::bom_byte_size(bom));
3356
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3357
+ if (size % 4 != 0) {
3358
+ printf(
3359
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3360
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3361
+ printf(" Running function on truncated input.\n");
3362
+ }
3363
+
3364
+ size /= 4;
3365
+
3366
+ // Note: In the "worst" case, a 32-bit word will yield two 16-bit code units.
3367
+ // So, we are making a safe assumption that each word will produce 2 bytes.
3368
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size * 2]};
3369
+
3370
+ volatile size_t sink{0};
3371
+
3372
+ auto proc = [&implementation, data, size, &output_buffer, &sink]() {
3373
+ if (byte_order == endianness::LITTLE) {
3374
+ sink = implementation.convert_valid_utf32_to_utf16le(data, size,
3375
+ output_buffer.get());
3376
+ } else {
3377
+ sink = implementation.convert_valid_utf32_to_utf16be(data, size,
3378
+ output_buffer.get());
3379
+ }
3380
+ };
3381
+ count_events(proc, iterations); // warming up!
3382
+ const auto result = count_events(proc, iterations);
3383
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3384
+ std::cerr << "The output is zero which might indicate an error.\n";
3385
+ }
3386
+ size_t char_count = size;
3387
+ print_summary(result, input_data.size(), char_count);
3388
+ }
3389
+
3390
+ void Benchmark::run_count_utf8(const simdutf::implementation &implementation,
3391
+ size_t iterations) {
3392
+ const char *data = reinterpret_cast<const char *>(input_data.data());
3393
+ const size_t size = input_data.size();
3394
+ volatile size_t sink{0};
3395
+
3396
+ auto proc = [&implementation, data, size, &sink]() {
3397
+ sink = implementation.count_utf8(data, size);
3398
+ };
3399
+ count_events(proc, iterations); // warming up!
3400
+ const auto result = count_events(proc, iterations);
3401
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3402
+ std::cerr << "The output is zero which might indicate an error.\n";
3403
+ }
3404
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
3405
+ print_summary(result, size, char_count);
3406
+ }
3407
+
3408
+ void Benchmark::run_count_utf16le(const simdutf::implementation &implementation,
3409
+ size_t iterations) {
3410
+ const simdutf::encoding_type bom =
3411
+ BOM::check_bom(input_data.data(), input_data.size());
3412
+ const char16_t *data = reinterpret_cast<const char16_t *>(input_data.data());
3413
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3414
+ if (size % 2 != 0) {
3415
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3416
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3417
+ printf(" Running function on truncated input.\n");
3418
+ }
3419
+ size /= 2;
3420
+ volatile size_t sink{0};
3421
+ auto proc = [&implementation, data, size, &sink]() {
3422
+ sink = implementation.count_utf16le(data, size);
3423
+ };
3424
+ count_events(proc, iterations); // warming up!
3425
+ const auto result = count_events(proc, iterations);
3426
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3427
+ std::cerr << "The output is zero which might indicate an error.\n";
3428
+ }
3429
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3430
+ print_summary(result, input_data.size(), char_count);
3431
+ }
3432
+
3433
+ void Benchmark::run_detect_encodings(
3434
+ const simdutf::implementation &implementation, size_t iterations) {
3435
+ const simdutf::encoding_type bom =
3436
+ BOM::check_bom(input_data.data(), input_data.size());
3437
+ const char *data = reinterpret_cast<const char *>(input_data.data() +
3438
+ BOM::bom_byte_size(bom));
3439
+ const size_t size = input_data.size() - BOM::bom_byte_size(bom);
3440
+ volatile size_t sink{0};
3441
+ auto proc = [&implementation, data, size, &sink]() {
3442
+ sink = implementation.detect_encodings(data, size);
3443
+ };
3444
+ count_events(proc, iterations); // warming up!
3445
+ const auto result = count_events(proc, iterations);
3446
+ size_t char_count = size;
3447
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3448
+ std::cerr << "The output is zero which might indicate an error.\n";
3449
+ } else {
3450
+ std::cout << "Detected format: ";
3451
+ if (sink & simdutf::encoding_type::UTF8) {
3452
+ char_count = get_active_implementation()->count_utf8(data, size);
3453
+ std::cout << " UTF8";
3454
+ }
3455
+ if (sink & simdutf::encoding_type::UTF16_LE) {
3456
+ std::cout << " UTF16LE";
3457
+ char_count = get_active_implementation()->count_utf16le(
3458
+ reinterpret_cast<const char16_t *>(data), size / 2);
3459
+ }
3460
+ if (sink & simdutf::encoding_type::UTF32_LE) {
3461
+ std::cout << " UTF32LE";
3462
+ char_count = size / 4;
3463
+ }
3464
+ std::cout << std::endl;
3465
+ }
3466
+ if ((bom) && (bom & ~sink)) {
3467
+ std::cerr << "[Error] BOM format : ";
3468
+ if (bom & simdutf::encoding_type::UTF8) {
3469
+ std::cerr << " UTF8";
3470
+ } else if (bom & simdutf::encoding_type::UTF16_LE) {
3471
+ std::cerr << " UTF16LE";
3472
+ } else if (bom & simdutf::encoding_type::UTF32_LE) {
3473
+ std::cerr << " UTF32LE";
3474
+ }
3475
+ std::cerr << std::endl;
3476
+ }
3477
+ if ((sink & (sink - 1)) != 0) {
3478
+ std::cout << "More than one format possible, character count is ambiguous."
3479
+ << std::endl;
3480
+ }
3481
+ print_summary(result, size, char_count);
3482
+ }
3483
+
3484
+ const std::set<std::string> Benchmark::all_procedures() const {
3485
+ std::set<std::string> result;
3486
+ for (const auto &item : benchmarks) {
3487
+ result.insert(item.first);
3488
+ }
3489
+
3490
+ return result;
3491
+ }
3492
+
3493
+ std::set<simdutf::encoding_type>
3494
+ Benchmark::expected_encodings(const std::string &procedure) {
3495
+ return benchmarks[procedure].second;
3496
+ }
3497
+
3498
+ /**
3499
+ * LLVM relies on code from the Unicode Consortium
3500
+ * https://en.wikipedia.org/wiki/Unicode_Consortium
3501
+ */
3502
+ void Benchmark::run_convert_utf8_to_utf16_llvm(size_t iterations) {
3503
+ const char *data = reinterpret_cast<const char *>(input_data.data());
3504
+ const size_t size = input_data.size();
3505
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
3506
+ volatile size_t sink{0};
3507
+ auto proc = [data, size, &output_buffer, &sink]() {
3508
+ const unsigned char *sourceStart =
3509
+ reinterpret_cast<const unsigned char *>(data);
3510
+ const unsigned char *sourceEnd = sourceStart + size;
3511
+ short unsigned int *targetStart =
3512
+ reinterpret_cast<short unsigned int *>(output_buffer.get());
3513
+ short unsigned int *targetEnd = targetStart + size;
3514
+ bool is_ok = (llvm::conversionOK ==
3515
+ llvm::ConvertUTF8toUTF16(
3516
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3517
+ llvm::ConversionFlags::strictConversion));
3518
+ if (is_ok) {
3519
+ sink = (targetStart -
3520
+ reinterpret_cast<short unsigned int *>(output_buffer.get()));
3521
+ } else {
3522
+ sink = 0;
3523
+ }
3524
+ };
3525
+ count_events(proc, iterations); // warming up!
3526
+ const auto result = count_events(proc, iterations);
3527
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3528
+ std::cerr
3529
+ << "The output is zero which might indicate a misconfiguration.\n";
3530
+ }
3531
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
3532
+ print_summary(result, size, char_count);
3533
+ }
3534
+
3535
+ void Benchmark::run_convert_utf8_to_utf32_llvm(size_t iterations) {
3536
+ const char *data = reinterpret_cast<const char *>(input_data.data());
3537
+ const size_t size = input_data.size();
3538
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size]};
3539
+ volatile size_t sink{0};
3540
+ auto proc = [data, size, &output_buffer, &sink]() {
3541
+ const unsigned char *sourceStart =
3542
+ reinterpret_cast<const unsigned char *>(data);
3543
+ const unsigned char *sourceEnd = sourceStart + size;
3544
+ unsigned int *targetStart =
3545
+ reinterpret_cast<unsigned int *>(output_buffer.get());
3546
+ unsigned int *targetEnd = targetStart + size;
3547
+ bool is_ok = (llvm::conversionOK ==
3548
+ llvm::ConvertUTF8toUTF32(
3549
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3550
+ llvm::ConversionFlags::strictConversion));
3551
+ if (is_ok) {
3552
+ sink =
3553
+ (targetStart - reinterpret_cast<unsigned int *>(output_buffer.get()));
3554
+ } else {
3555
+ sink = 0;
3556
+ }
3557
+ };
3558
+ count_events(proc, iterations); // warming up!
3559
+ const auto result = count_events(proc, iterations);
3560
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3561
+ std::cerr
3562
+ << "The output is zero which might indicate a misconfiguration.\n";
3563
+ }
3564
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
3565
+ print_summary(result, size, char_count);
3566
+ }
3567
+
3568
+ void Benchmark::run_convert_utf16_to_utf8_llvm(size_t iterations) {
3569
+ const simdutf::encoding_type bom =
3570
+ BOM::check_bom(input_data.data(), input_data.size());
3571
+ const char16_t *data = reinterpret_cast<const char16_t *>(
3572
+ input_data.data() + BOM::bom_byte_size(bom));
3573
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3574
+ if (size % 2 != 0) {
3575
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3576
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3577
+ printf(" Running function on truncated input.\n");
3578
+ }
3579
+
3580
+ size /= 2;
3581
+
3582
+ // Note: non-surrogate code units can yield up to 3 bytes, a surrogate pair
3583
+ // yields 4 bytes,
3584
+ // thus we're making safe assumption that each 16-bit word will be
3585
+ // expanded to four bytes.
3586
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3587
+
3588
+ volatile size_t sink{0};
3589
+
3590
+ auto proc = [data, size, &output_buffer, &sink]() {
3591
+ const short unsigned int *sourceStart =
3592
+ reinterpret_cast<const short unsigned int *>(data);
3593
+ const short unsigned int *sourceEnd = sourceStart + size;
3594
+ unsigned char *targetStart =
3595
+ reinterpret_cast<unsigned char *>(output_buffer.get());
3596
+ unsigned char *targetEnd = targetStart + size * 4;
3597
+ bool is_ok = (llvm::conversionOK ==
3598
+ llvm::ConvertUTF16toUTF8(
3599
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3600
+ llvm::ConversionFlags::strictConversion));
3601
+ if (is_ok) {
3602
+ sink = (targetStart -
3603
+ reinterpret_cast<unsigned char *>(output_buffer.get()));
3604
+ } else {
3605
+ sink = 0;
3606
+ }
3607
+ };
3608
+ count_events(proc, iterations); // warming up!
3609
+ const auto result = count_events(proc, iterations);
3610
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3611
+ std::cerr << "The output is zero which might indicate an error.\n";
3612
+ }
3613
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3614
+ print_summary(result, input_data.size(), char_count);
3615
+ }
3616
+
3617
+ void Benchmark::run_convert_utf32_to_utf8_llvm(size_t iterations) {
3618
+ const simdutf::encoding_type bom =
3619
+ BOM::check_bom(input_data.data(), input_data.size());
3620
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3621
+ input_data.data() + BOM::bom_byte_size(bom));
3622
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3623
+ if (size % 4 != 0) {
3624
+ printf(
3625
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3626
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3627
+ printf(" Running function on truncated input.\n");
3628
+ }
3629
+
3630
+ size /= 4;
3631
+
3632
+ // Note: a single 32-bit word can yield up to four UTF-8 bytes. We are
3633
+ // making a safe assumption that each 32-bit word will yield four
3634
+ // UTF-8 bytes.
3635
+ std::unique_ptr<char[]> output_buffer{new char[size * 4]};
3636
+
3637
+ volatile size_t sink{0};
3638
+
3639
+ auto proc = [data, size, &output_buffer, &sink]() {
3640
+ const unsigned int *sourceStart =
3641
+ reinterpret_cast<const unsigned int *>(data);
3642
+ const unsigned int *sourceEnd = sourceStart + size;
3643
+ unsigned char *targetStart =
3644
+ reinterpret_cast<unsigned char *>(output_buffer.get());
3645
+ unsigned char *targetEnd = targetStart + size * 4;
3646
+ bool is_ok = (llvm::conversionOK ==
3647
+ llvm::ConvertUTF32toUTF8(
3648
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3649
+ llvm::ConversionFlags::strictConversion));
3650
+ if (is_ok) {
3651
+ sink = (targetStart -
3652
+ reinterpret_cast<unsigned char *>(output_buffer.get()));
3653
+ } else {
3654
+ sink = 0;
3655
+ }
3656
+ };
3657
+ count_events(proc, iterations); // warming up!
3658
+ const auto result = count_events(proc, iterations);
3659
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3660
+ std::cerr << "The output is zero which might indicate an error.\n";
3661
+ }
3662
+ size_t char_count = size;
3663
+ print_summary(result, input_data.size(), char_count);
3664
+ }
3665
+
3666
+ void Benchmark::run_convert_utf16_to_utf32_llvm(size_t iterations) {
3667
+ const simdutf::encoding_type bom =
3668
+ BOM::check_bom(input_data.data(), input_data.size());
3669
+ const char16_t *data = reinterpret_cast<const char16_t *>(
3670
+ input_data.data() + BOM::bom_byte_size(bom));
3671
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3672
+ if (size % 2 != 0) {
3673
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3674
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3675
+ printf(" Running function on truncated input.\n");
3676
+ }
3677
+
3678
+ size /= 2;
3679
+
3680
+ // Note: all code units yield four bytes. We make the safe assumption that all
3681
+ // code units will be non surrogate code units so the size will double (16
3682
+ // bits -> 32 bits).
3683
+ std::unique_ptr<char32_t[]> output_buffer{new char32_t[size * 2]};
3684
+
3685
+ volatile size_t sink{0};
3686
+
3687
+ auto proc = [data, size, &output_buffer, &sink]() {
3688
+ const short unsigned int *sourceStart =
3689
+ reinterpret_cast<const short unsigned int *>(data);
3690
+ const short unsigned int *sourceEnd = sourceStart + size;
3691
+ unsigned int *targetStart =
3692
+ reinterpret_cast<unsigned int *>(output_buffer.get());
3693
+ unsigned int *targetEnd = targetStart + 2 * size;
3694
+ bool is_ok = (llvm::conversionOK ==
3695
+ llvm::ConvertUTF16toUTF32(
3696
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3697
+ llvm::ConversionFlags::strictConversion));
3698
+ if (is_ok) {
3699
+ sink =
3700
+ (targetStart - reinterpret_cast<unsigned int *>(output_buffer.get()));
3701
+ } else {
3702
+ sink = 0;
3703
+ }
3704
+ };
3705
+ count_events(proc, iterations); // warming up!
3706
+ const auto result = count_events(proc, iterations);
3707
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3708
+ std::cerr << "The output is zero which might indicate an error.\n";
3709
+ }
3710
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3711
+ print_summary(result, input_data.size(), char_count);
3712
+ }
3713
+
3714
+ void Benchmark::run_convert_utf32_to_utf16_llvm(size_t iterations) {
3715
+ const simdutf::encoding_type bom =
3716
+ BOM::check_bom(input_data.data(), input_data.size());
3717
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3718
+ input_data.data() + BOM::bom_byte_size(bom));
3719
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3720
+ if (size % 4 != 0) {
3721
+ printf(
3722
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3723
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3724
+ printf(" Running function on truncated input.\n");
3725
+ }
3726
+
3727
+ size /= 4;
3728
+
3729
+ // Note: a single 32-bit word can produce a surrogate pair, i.e. two
3730
+ // 16-bit code units. We are making a safe assumption that each 32-
3731
+ // bit word will yield two 16-bit code units.
3732
+ std::unique_ptr<char[]> output_buffer{new char[size * 2]};
3733
+
3734
+ volatile size_t sink{0};
3735
+
3736
+ auto proc = [data, size, &output_buffer, &sink]() {
3737
+ const unsigned int *sourceStart =
3738
+ reinterpret_cast<const unsigned int *>(data);
3739
+ const unsigned int *sourceEnd = sourceStart + size;
3740
+ short unsigned int *targetStart =
3741
+ reinterpret_cast<short unsigned int *>(output_buffer.get());
3742
+ short unsigned int *targetEnd = targetStart + size * 2;
3743
+ bool is_ok = (llvm::conversionOK ==
3744
+ llvm::ConvertUTF32toUTF16(
3745
+ &sourceStart, sourceEnd, &targetStart, targetEnd,
3746
+ llvm::ConversionFlags::strictConversion));
3747
+ if (is_ok) {
3748
+ sink = (targetStart -
3749
+ reinterpret_cast<short unsigned int *>(output_buffer.get()));
3750
+ } else {
3751
+ sink = 0;
3752
+ }
3753
+ };
3754
+ count_events(proc, iterations); // warming up!
3755
+ const auto result = count_events(proc, iterations);
3756
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3757
+ std::cerr << "The output is zero which might indicate an error.\n";
3758
+ }
3759
+ size_t char_count = size;
3760
+ print_summary(result, input_data.size(), char_count);
3761
+ }
3762
+
3763
+ /**
3764
+ * Nemanja Trifunovic, UTF8-CPP: UTF-8 with C++ in a Portable Way
3765
+ * https://github.com/nemtrif/utfcpp/releases/tag/v3.2.2
3766
+ */
3767
+ void Benchmark::run_convert_utf8_to_utf16_utfcpp(size_t iterations) {
3768
+ const char *data = reinterpret_cast<const char *>(input_data.data());
3769
+ const size_t size = input_data.size();
3770
+ volatile size_t sink{0};
3771
+
3772
+ auto proc = [data, size, &sink]() {
3773
+ try {
3774
+ std::vector<unsigned short> str;
3775
+ utf8::utf8to16(data, data + size, std::back_inserter(str));
3776
+ sink = str.size();
3777
+ } catch (const char *msg) {
3778
+ std::cout << msg << std::endl;
3779
+ sink = 0;
3780
+ } catch (...) {
3781
+ sink = 0;
3782
+ }
3783
+ };
3784
+ count_events(proc, iterations); // warming up!
3785
+ const auto result = count_events(proc, iterations);
3786
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3787
+ std::cerr
3788
+ << "The output is zero which might indicate a misconfiguration.\n";
3789
+ }
3790
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
3791
+ // checking
3792
+ std::unique_ptr<char16_t[]> output_buffer{new char16_t[size]};
3793
+ size_t expected = convert_utf8_to_utf16le(data, size, output_buffer.get());
3794
+ if (expected != sink) {
3795
+ std::cerr << "The number of UTF-16 code units does not match.\n";
3796
+ }
3797
+ print_summary(result, size, char_count);
3798
+ }
3799
+
3800
+ void Benchmark::run_convert_utf16_to_utf8_utfcpp(size_t iterations) {
3801
+ const simdutf::encoding_type bom =
3802
+ BOM::check_bom(input_data.data(), input_data.size());
3803
+ const char16_t *data = reinterpret_cast<const char16_t *>(
3804
+ input_data.data() + BOM::bom_byte_size(bom));
3805
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3806
+ if (size % 2 != 0) {
3807
+ printf("# The input size is not divisible by two (it is %zu + %zu for BOM)",
3808
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3809
+ printf(" Running function on truncated input.\n");
3810
+ }
3811
+
3812
+ volatile size_t sink{0};
3813
+ auto proc = [data, size, &sink]() {
3814
+ try {
3815
+ std::string str;
3816
+ utf8::utf16to8(data, data + size, std::back_inserter(str));
3817
+ sink = str.size();
3818
+ } catch (const char *msg) {
3819
+ std::cout << msg << std::endl;
3820
+ sink = 0;
3821
+ } catch (...) {
3822
+ sink = 0;
3823
+ }
3824
+ };
3825
+ count_events(proc, iterations); // warming up!
3826
+ const auto result = count_events(proc, iterations);
3827
+ size /= 2;
3828
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3829
+ std::cerr << "The output is zero which might indicate an error.\n";
3830
+ }
3831
+
3832
+ size_t char_count = get_active_implementation()->count_utf16le(data, size);
3833
+ print_summary(result, input_data.size(), char_count);
3834
+ }
3835
+
3836
+ void Benchmark::run_convert_utf8_to_utf32_utfcpp(size_t iterations) {
3837
+ const char *data = reinterpret_cast<const char *>(input_data.data());
3838
+ const size_t size = input_data.size();
3839
+ volatile size_t sink{0};
3840
+
3841
+ auto proc = [data, size, &sink]() {
3842
+ try {
3843
+ std::vector<int> str;
3844
+ utf8::utf8to32(data, data + size, std::back_inserter(str));
3845
+ sink = str.size();
3846
+ } catch (const char *msg) {
3847
+ std::cout << msg << std::endl;
3848
+ sink = 0;
3849
+ } catch (...) {
3850
+ sink = 0;
3851
+ }
3852
+ };
3853
+ count_events(proc, iterations); // warming up!
3854
+ const auto result = count_events(proc, iterations);
3855
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3856
+ std::cerr
3857
+ << "The output is zero which might indicate a misconfiguration.\n";
3858
+ }
3859
+ size_t char_count = get_active_implementation()->count_utf8(data, size);
3860
+ print_summary(result, size, char_count);
3861
+ }
3862
+
3863
+ void Benchmark::run_convert_utf32_to_utf8_utfcpp(size_t iterations) {
3864
+ const simdutf::encoding_type bom =
3865
+ BOM::check_bom(input_data.data(), input_data.size());
3866
+ const char32_t *data = reinterpret_cast<const char32_t *>(
3867
+ input_data.data() + BOM::bom_byte_size(bom));
3868
+ size_t size = input_data.size() - BOM::bom_byte_size(bom);
3869
+ if (size % 4 != 0) {
3870
+ printf(
3871
+ "# The input size is not divisible by four (it is %zu + %zu for BOM)",
3872
+ size_t(input_data.size()), size_t(BOM::bom_byte_size(bom)));
3873
+ printf(" Running function on truncated input.\n");
3874
+ }
3875
+
3876
+ volatile size_t sink{0};
3877
+
3878
+ auto proc = [data, size, &sink]() {
3879
+ try {
3880
+ std::string str;
3881
+ utf8::utf16to8(data, data + size, std::back_inserter(str));
3882
+ sink = str.size();
3883
+ } catch (const char *msg) {
3884
+ std::cout << msg << std::endl;
3885
+ sink = 0;
3886
+ } catch (...) {
3887
+ sink = 0;
3888
+ }
3889
+ };
3890
+ count_events(proc, iterations); // warming up!
3891
+ const auto result = count_events(proc, iterations);
3892
+ if ((sink == 0) && (size != 0) && (iterations > 0)) {
3893
+ std::cerr << "The output is zero which might indicate an error.\n";
3894
+ }
3895
+ size_t char_count = size / 4;
3896
+ print_summary(result, input_data.size(), char_count);
3897
+ }
3898
+
3899
+ } // namespace simdutf::benchmarks