rugged 1.7.2 → 1.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (361) hide show
  1. checksums.yaml +4 -4
  2. data/lib/rugged/version.rb +1 -1
  3. data/vendor/libgit2/AUTHORS +1 -0
  4. data/vendor/libgit2/CMakeLists.txt +23 -10
  5. data/vendor/libgit2/COPYING +195 -1
  6. data/vendor/libgit2/cmake/{FindIconv.cmake → FindIntlIconv.cmake} +6 -0
  7. data/vendor/libgit2/cmake/FindLLHTTP.cmake +39 -0
  8. data/vendor/libgit2/cmake/SelectGSSAPI.cmake +1 -1
  9. data/vendor/libgit2/cmake/SelectHTTPParser.cmake +23 -8
  10. data/vendor/libgit2/cmake/SelectHTTPSBackend.cmake +17 -8
  11. data/vendor/libgit2/cmake/SelectHashes.cmake +28 -11
  12. data/vendor/libgit2/cmake/SelectRegex.cmake +6 -1
  13. data/vendor/libgit2/cmake/SelectSSH.cmake +22 -17
  14. data/vendor/libgit2/cmake/SelectZlib.cmake +4 -0
  15. data/vendor/libgit2/deps/llhttp/CMakeLists.txt +8 -0
  16. data/vendor/libgit2/deps/llhttp/LICENSE-MIT +22 -0
  17. data/vendor/libgit2/deps/llhttp/api.c +510 -0
  18. data/vendor/libgit2/deps/llhttp/http.c +170 -0
  19. data/vendor/libgit2/deps/llhttp/llhttp.c +10168 -0
  20. data/vendor/libgit2/deps/llhttp/llhttp.h +897 -0
  21. data/vendor/libgit2/deps/ntlmclient/CMakeLists.txt +1 -1
  22. data/vendor/libgit2/deps/ntlmclient/crypt_builtin_md4.c +311 -0
  23. data/vendor/libgit2/deps/ntlmclient/crypt_commoncrypto.c +2 -1
  24. data/vendor/libgit2/deps/ntlmclient/crypt_mbedtls.c +0 -20
  25. data/vendor/libgit2/deps/ntlmclient/crypt_openssl.c +4 -4
  26. data/vendor/libgit2/deps/ntlmclient/ntlm.c +21 -21
  27. data/vendor/libgit2/deps/ntlmclient/unicode_builtin.c +5 -4
  28. data/vendor/libgit2/deps/ntlmclient/unicode_iconv.c +2 -1
  29. data/vendor/libgit2/deps/ntlmclient/utf8.h +1176 -721
  30. data/vendor/libgit2/deps/ntlmclient/util.h +11 -0
  31. data/vendor/libgit2/deps/pcre/CMakeLists.txt +1 -0
  32. data/vendor/libgit2/deps/xdiff/xmerge.c +2 -2
  33. data/vendor/libgit2/deps/zlib/CMakeLists.txt +6 -1
  34. data/vendor/libgit2/deps/zlib/LICENSE +22 -0
  35. data/vendor/libgit2/deps/zlib/adler32.c +5 -27
  36. data/vendor/libgit2/deps/zlib/crc32.c +94 -167
  37. data/vendor/libgit2/deps/zlib/deflate.c +358 -435
  38. data/vendor/libgit2/deps/zlib/deflate.h +41 -10
  39. data/vendor/libgit2/deps/zlib/gzguts.h +13 -18
  40. data/vendor/libgit2/deps/zlib/infback.c +17 -30
  41. data/vendor/libgit2/deps/zlib/inffast.c +1 -4
  42. data/vendor/libgit2/deps/zlib/inffast.h +1 -1
  43. data/vendor/libgit2/deps/zlib/inflate.c +36 -102
  44. data/vendor/libgit2/deps/zlib/inftrees.c +6 -11
  45. data/vendor/libgit2/deps/zlib/inftrees.h +6 -6
  46. data/vendor/libgit2/deps/zlib/trees.c +287 -352
  47. data/vendor/libgit2/deps/zlib/zconf.h +23 -14
  48. data/vendor/libgit2/deps/zlib/zlib.h +202 -202
  49. data/vendor/libgit2/deps/zlib/zutil.c +18 -44
  50. data/vendor/libgit2/deps/zlib/zutil.h +13 -33
  51. data/vendor/libgit2/include/git2/annotated_commit.h +12 -5
  52. data/vendor/libgit2/include/git2/apply.h +27 -6
  53. data/vendor/libgit2/include/git2/attr.h +17 -4
  54. data/vendor/libgit2/include/git2/blame.h +133 -28
  55. data/vendor/libgit2/include/git2/blob.h +71 -28
  56. data/vendor/libgit2/include/git2/branch.h +22 -15
  57. data/vendor/libgit2/include/git2/buffer.h +6 -4
  58. data/vendor/libgit2/include/git2/cert.h +2 -1
  59. data/vendor/libgit2/include/git2/checkout.h +83 -32
  60. data/vendor/libgit2/include/git2/cherrypick.h +10 -3
  61. data/vendor/libgit2/include/git2/clone.h +25 -9
  62. data/vendor/libgit2/include/git2/commit.h +132 -3
  63. data/vendor/libgit2/include/git2/common.h +120 -63
  64. data/vendor/libgit2/include/git2/config.h +93 -23
  65. data/vendor/libgit2/include/git2/credential.h +30 -2
  66. data/vendor/libgit2/include/git2/credential_helpers.h +1 -0
  67. data/vendor/libgit2/include/git2/deprecated.h +133 -3
  68. data/vendor/libgit2/include/git2/describe.h +13 -1
  69. data/vendor/libgit2/include/git2/diff.h +38 -8
  70. data/vendor/libgit2/include/git2/email.h +9 -29
  71. data/vendor/libgit2/include/git2/errors.h +46 -73
  72. data/vendor/libgit2/include/git2/filter.h +14 -7
  73. data/vendor/libgit2/include/git2/global.h +8 -1
  74. data/vendor/libgit2/include/git2/graph.h +3 -2
  75. data/vendor/libgit2/include/git2/ignore.h +10 -0
  76. data/vendor/libgit2/include/git2/index.h +99 -14
  77. data/vendor/libgit2/include/git2/indexer.h +21 -4
  78. data/vendor/libgit2/include/git2/mailmap.h +7 -1
  79. data/vendor/libgit2/include/git2/merge.h +46 -1
  80. data/vendor/libgit2/include/git2/message.h +2 -2
  81. data/vendor/libgit2/include/git2/net.h +3 -1
  82. data/vendor/libgit2/include/git2/notes.h +9 -6
  83. data/vendor/libgit2/include/git2/object.h +9 -8
  84. data/vendor/libgit2/include/git2/odb.h +91 -49
  85. data/vendor/libgit2/include/git2/odb_backend.h +80 -52
  86. data/vendor/libgit2/include/git2/oid.h +23 -24
  87. data/vendor/libgit2/include/git2/oidarray.h +7 -1
  88. data/vendor/libgit2/include/git2/pack.h +13 -1
  89. data/vendor/libgit2/include/git2/patch.h +2 -3
  90. data/vendor/libgit2/include/git2/pathspec.h +9 -0
  91. data/vendor/libgit2/include/git2/proxy.h +10 -0
  92. data/vendor/libgit2/include/git2/rebase.h +9 -6
  93. data/vendor/libgit2/include/git2/refdb.h +2 -2
  94. data/vendor/libgit2/include/git2/reflog.h +3 -2
  95. data/vendor/libgit2/include/git2/refs.h +9 -6
  96. data/vendor/libgit2/include/git2/refspec.h +14 -4
  97. data/vendor/libgit2/include/git2/remote.h +94 -18
  98. data/vendor/libgit2/include/git2/repository.h +57 -21
  99. data/vendor/libgit2/include/git2/reset.h +16 -3
  100. data/vendor/libgit2/include/git2/revert.h +9 -4
  101. data/vendor/libgit2/include/git2/revparse.h +3 -3
  102. data/vendor/libgit2/include/git2/revwalk.h +3 -2
  103. data/vendor/libgit2/include/git2/signature.h +46 -1
  104. data/vendor/libgit2/include/git2/stash.h +17 -3
  105. data/vendor/libgit2/include/git2/status.h +10 -6
  106. data/vendor/libgit2/include/git2/stdint.h +87 -85
  107. data/vendor/libgit2/include/git2/strarray.h +2 -3
  108. data/vendor/libgit2/include/git2/submodule.h +20 -9
  109. data/vendor/libgit2/include/git2/sys/alloc.h +12 -0
  110. data/vendor/libgit2/include/git2/sys/commit.h +77 -3
  111. data/vendor/libgit2/include/git2/sys/commit_graph.h +103 -62
  112. data/vendor/libgit2/include/git2/sys/config.h +80 -4
  113. data/vendor/libgit2/include/git2/sys/credential.h +4 -3
  114. data/vendor/libgit2/include/git2/sys/diff.h +21 -1
  115. data/vendor/libgit2/include/git2/sys/email.h +7 -0
  116. data/vendor/libgit2/include/git2/sys/errors.h +76 -0
  117. data/vendor/libgit2/include/git2/sys/filter.h +66 -3
  118. data/vendor/libgit2/include/git2/sys/hashsig.h +11 -0
  119. data/vendor/libgit2/include/git2/sys/index.h +3 -2
  120. data/vendor/libgit2/include/git2/sys/mempack.h +32 -2
  121. data/vendor/libgit2/include/git2/sys/merge.h +55 -7
  122. data/vendor/libgit2/include/git2/sys/midx.h +43 -4
  123. data/vendor/libgit2/include/git2/sys/odb_backend.h +7 -3
  124. data/vendor/libgit2/include/git2/sys/openssl.h +8 -1
  125. data/vendor/libgit2/include/git2/sys/path.h +12 -1
  126. data/vendor/libgit2/include/git2/sys/refdb_backend.h +40 -36
  127. data/vendor/libgit2/include/git2/sys/refs.h +3 -2
  128. data/vendor/libgit2/include/git2/sys/remote.h +8 -1
  129. data/vendor/libgit2/include/git2/sys/repository.h +63 -3
  130. data/vendor/libgit2/include/git2/sys/stream.h +11 -2
  131. data/vendor/libgit2/include/git2/sys/transport.h +24 -3
  132. data/vendor/libgit2/include/git2/tag.h +3 -1
  133. data/vendor/libgit2/include/git2/trace.h +9 -3
  134. data/vendor/libgit2/include/git2/transaction.h +3 -2
  135. data/vendor/libgit2/include/git2/transport.h +11 -3
  136. data/vendor/libgit2/include/git2/tree.h +16 -5
  137. data/vendor/libgit2/include/git2/types.h +19 -3
  138. data/vendor/libgit2/include/git2/version.h +44 -8
  139. data/vendor/libgit2/include/git2/worktree.h +16 -6
  140. data/vendor/libgit2/src/CMakeLists.txt +6 -4
  141. data/vendor/libgit2/src/cli/CMakeLists.txt +2 -2
  142. data/vendor/libgit2/src/cli/cmd.c +1 -1
  143. data/vendor/libgit2/src/cli/cmd.h +4 -0
  144. data/vendor/libgit2/src/cli/cmd_blame.c +287 -0
  145. data/vendor/libgit2/src/cli/cmd_cat_file.c +6 -8
  146. data/vendor/libgit2/src/cli/cmd_clone.c +5 -7
  147. data/vendor/libgit2/src/cli/cmd_config.c +241 -0
  148. data/vendor/libgit2/src/cli/cmd_hash_object.c +6 -8
  149. data/vendor/libgit2/src/cli/cmd_help.c +6 -7
  150. data/vendor/libgit2/src/cli/cmd_index_pack.c +114 -0
  151. data/vendor/libgit2/src/cli/cmd_init.c +102 -0
  152. data/vendor/libgit2/src/cli/common.c +168 -0
  153. data/vendor/libgit2/src/cli/common.h +63 -0
  154. data/vendor/libgit2/src/cli/error.h +1 -1
  155. data/vendor/libgit2/src/cli/main.c +52 -24
  156. data/vendor/libgit2/src/cli/opt.c +29 -3
  157. data/vendor/libgit2/src/cli/opt.h +21 -3
  158. data/vendor/libgit2/src/cli/opt_usage.c +102 -33
  159. data/vendor/libgit2/src/cli/opt_usage.h +6 -1
  160. data/vendor/libgit2/src/cli/progress.c +51 -2
  161. data/vendor/libgit2/src/cli/progress.h +12 -0
  162. data/vendor/libgit2/src/cli/unix/sighandler.c +2 -1
  163. data/vendor/libgit2/src/cli/win32/precompiled.h +1 -1
  164. data/vendor/libgit2/src/cli/win32/sighandler.c +1 -1
  165. data/vendor/libgit2/src/libgit2/CMakeLists.txt +26 -8
  166. data/vendor/libgit2/src/libgit2/apply.c +10 -13
  167. data/vendor/libgit2/src/libgit2/attr.c +30 -13
  168. data/vendor/libgit2/src/libgit2/attr_file.c +7 -2
  169. data/vendor/libgit2/src/libgit2/attr_file.h +2 -0
  170. data/vendor/libgit2/src/libgit2/attrcache.c +69 -33
  171. data/vendor/libgit2/src/libgit2/attrcache.h +5 -9
  172. data/vendor/libgit2/src/libgit2/blame.c +130 -44
  173. data/vendor/libgit2/src/libgit2/blame.h +1 -0
  174. data/vendor/libgit2/src/libgit2/cache.c +22 -17
  175. data/vendor/libgit2/src/libgit2/cache.h +7 -9
  176. data/vendor/libgit2/src/libgit2/checkout.c +34 -24
  177. data/vendor/libgit2/src/libgit2/checkout.h +0 -2
  178. data/vendor/libgit2/src/libgit2/cherrypick.c +1 -2
  179. data/vendor/libgit2/src/libgit2/clone.c +186 -166
  180. data/vendor/libgit2/src/libgit2/clone.h +4 -1
  181. data/vendor/libgit2/src/libgit2/commit.c +92 -0
  182. data/vendor/libgit2/src/libgit2/commit_graph.c +67 -56
  183. data/vendor/libgit2/src/libgit2/commit_graph.h +1 -2
  184. data/vendor/libgit2/src/libgit2/config.c +389 -298
  185. data/vendor/libgit2/src/libgit2/config.cmake.in +3 -0
  186. data/vendor/libgit2/src/libgit2/config.h +9 -4
  187. data/vendor/libgit2/src/libgit2/config_backend.h +8 -10
  188. data/vendor/libgit2/src/libgit2/config_cache.c +4 -5
  189. data/vendor/libgit2/src/libgit2/config_file.c +99 -88
  190. data/vendor/libgit2/src/libgit2/config_list.c +285 -0
  191. data/vendor/libgit2/src/libgit2/config_list.h +32 -0
  192. data/vendor/libgit2/src/libgit2/config_mem.c +194 -40
  193. data/vendor/libgit2/src/libgit2/config_parse.c +10 -9
  194. data/vendor/libgit2/src/libgit2/config_snapshot.c +24 -31
  195. data/vendor/libgit2/src/libgit2/describe.c +24 -24
  196. data/vendor/libgit2/src/libgit2/diff.c +1 -1
  197. data/vendor/libgit2/src/libgit2/diff_driver.c +12 -19
  198. data/vendor/libgit2/src/libgit2/diff_driver.h +2 -2
  199. data/vendor/libgit2/src/libgit2/diff_generate.c +3 -3
  200. data/vendor/libgit2/src/libgit2/diff_parse.c +2 -2
  201. data/vendor/libgit2/src/libgit2/diff_print.c +65 -9
  202. data/vendor/libgit2/src/libgit2/diff_tform.c +36 -8
  203. data/vendor/libgit2/src/libgit2/email.c +1 -0
  204. data/vendor/libgit2/src/libgit2/fetch.c +5 -3
  205. data/vendor/libgit2/src/libgit2/filter.c +5 -5
  206. data/vendor/libgit2/src/libgit2/git2.rc +3 -3
  207. data/vendor/libgit2/src/libgit2/grafts.c +18 -20
  208. data/vendor/libgit2/src/libgit2/grafts.h +0 -1
  209. data/vendor/libgit2/src/libgit2/graph.c +1 -1
  210. data/vendor/libgit2/src/libgit2/hashmap_oid.h +30 -0
  211. data/vendor/libgit2/src/libgit2/ignore.c +9 -5
  212. data/vendor/libgit2/src/libgit2/index.c +68 -90
  213. data/vendor/libgit2/src/libgit2/index.h +2 -2
  214. data/vendor/libgit2/src/libgit2/index_map.c +95 -0
  215. data/vendor/libgit2/src/libgit2/index_map.h +28 -0
  216. data/vendor/libgit2/src/libgit2/indexer.c +34 -38
  217. data/vendor/libgit2/src/libgit2/iterator.c +14 -8
  218. data/vendor/libgit2/src/libgit2/libgit2.c +153 -368
  219. data/vendor/libgit2/src/libgit2/mailmap.c +1 -1
  220. data/vendor/libgit2/src/libgit2/merge.c +42 -37
  221. data/vendor/libgit2/src/libgit2/merge_driver.c +2 -2
  222. data/vendor/libgit2/src/libgit2/midx.c +28 -15
  223. data/vendor/libgit2/src/libgit2/mwindow.c +38 -45
  224. data/vendor/libgit2/src/libgit2/mwindow.h +4 -0
  225. data/vendor/libgit2/src/libgit2/object.c +6 -5
  226. data/vendor/libgit2/src/libgit2/odb.c +5 -4
  227. data/vendor/libgit2/src/libgit2/odb_mempack.c +49 -17
  228. data/vendor/libgit2/src/libgit2/odb_pack.c +13 -5
  229. data/vendor/libgit2/src/libgit2/oid.c +32 -5
  230. data/vendor/libgit2/src/libgit2/oid.h +11 -0
  231. data/vendor/libgit2/src/libgit2/pack-objects.c +58 -31
  232. data/vendor/libgit2/src/libgit2/pack-objects.h +12 -4
  233. data/vendor/libgit2/src/libgit2/pack.c +30 -24
  234. data/vendor/libgit2/src/libgit2/pack.h +15 -10
  235. data/vendor/libgit2/src/libgit2/patch_parse.c +2 -2
  236. data/vendor/libgit2/src/libgit2/path.c +1 -1
  237. data/vendor/libgit2/src/libgit2/pathspec.c +1 -1
  238. data/vendor/libgit2/src/libgit2/push.c +79 -28
  239. data/vendor/libgit2/src/libgit2/push.h +1 -0
  240. data/vendor/libgit2/src/libgit2/refdb_fs.c +128 -61
  241. data/vendor/libgit2/src/libgit2/reflog.c +1 -2
  242. data/vendor/libgit2/src/libgit2/reflog.h +2 -0
  243. data/vendor/libgit2/src/libgit2/refs.c +26 -7
  244. data/vendor/libgit2/src/libgit2/refs.h +6 -1
  245. data/vendor/libgit2/src/libgit2/refspec.c +28 -1
  246. data/vendor/libgit2/src/libgit2/refspec.h +8 -0
  247. data/vendor/libgit2/src/libgit2/remote.c +121 -61
  248. data/vendor/libgit2/src/libgit2/repository.c +231 -51
  249. data/vendor/libgit2/src/libgit2/repository.h +10 -6
  250. data/vendor/libgit2/src/libgit2/revert.c +1 -2
  251. data/vendor/libgit2/src/libgit2/revparse.c +2 -2
  252. data/vendor/libgit2/src/libgit2/revwalk.c +13 -10
  253. data/vendor/libgit2/src/libgit2/revwalk.h +3 -3
  254. data/vendor/libgit2/src/libgit2/settings.c +468 -0
  255. data/vendor/libgit2/src/libgit2/settings.h +6 -2
  256. data/vendor/libgit2/src/libgit2/signature.c +132 -15
  257. data/vendor/libgit2/src/libgit2/signature.h +0 -1
  258. data/vendor/libgit2/src/libgit2/status.c +1 -1
  259. data/vendor/libgit2/src/libgit2/streams/mbedtls.c +54 -60
  260. data/vendor/libgit2/src/libgit2/streams/openssl.c +32 -7
  261. data/vendor/libgit2/src/libgit2/streams/openssl.h +2 -0
  262. data/vendor/libgit2/src/libgit2/streams/openssl_dynamic.c +4 -0
  263. data/vendor/libgit2/src/libgit2/streams/openssl_dynamic.h +3 -0
  264. data/vendor/libgit2/src/libgit2/streams/stransport.c +39 -7
  265. data/vendor/libgit2/src/libgit2/submodule.c +106 -63
  266. data/vendor/libgit2/src/libgit2/submodule.h +6 -7
  267. data/vendor/libgit2/src/libgit2/tag.c +1 -1
  268. data/vendor/libgit2/src/libgit2/trailer.c +6 -6
  269. data/vendor/libgit2/src/libgit2/transaction.c +26 -20
  270. data/vendor/libgit2/src/libgit2/transaction.h +4 -1
  271. data/vendor/libgit2/src/libgit2/transport.c +4 -1
  272. data/vendor/libgit2/src/libgit2/transports/credential.c +1 -1
  273. data/vendor/libgit2/src/libgit2/transports/http.c +1 -2
  274. data/vendor/libgit2/src/libgit2/transports/http.h +0 -10
  275. data/vendor/libgit2/src/libgit2/transports/httpclient.c +112 -72
  276. data/vendor/libgit2/src/libgit2/transports/httpparser.c +128 -0
  277. data/vendor/libgit2/src/libgit2/transports/httpparser.h +99 -0
  278. data/vendor/libgit2/src/libgit2/transports/local.c +8 -7
  279. data/vendor/libgit2/src/libgit2/transports/smart.c +20 -8
  280. data/vendor/libgit2/src/libgit2/transports/smart.h +4 -2
  281. data/vendor/libgit2/src/libgit2/transports/smart_pkt.c +2 -2
  282. data/vendor/libgit2/src/libgit2/transports/smart_protocol.c +55 -10
  283. data/vendor/libgit2/src/libgit2/transports/ssh.c +41 -1103
  284. data/vendor/libgit2/src/libgit2/transports/ssh_exec.c +347 -0
  285. data/vendor/libgit2/src/libgit2/transports/ssh_exec.h +26 -0
  286. data/vendor/libgit2/src/libgit2/transports/ssh_libssh2.c +1126 -0
  287. data/vendor/libgit2/src/libgit2/transports/ssh_libssh2.h +28 -0
  288. data/vendor/libgit2/src/libgit2/transports/winhttp.c +35 -7
  289. data/vendor/libgit2/src/libgit2/tree.c +34 -26
  290. data/vendor/libgit2/src/libgit2/tree.h +3 -2
  291. data/vendor/libgit2/src/libgit2/worktree.c +14 -17
  292. data/vendor/libgit2/src/util/CMakeLists.txt +4 -6
  293. data/vendor/libgit2/src/util/alloc.c +4 -1
  294. data/vendor/libgit2/src/util/allocators/debugalloc.c +73 -0
  295. data/vendor/libgit2/src/{cli/cli.h → util/allocators/debugalloc.h} +6 -9
  296. data/vendor/libgit2/src/util/allocators/stdalloc.c +0 -10
  297. data/vendor/libgit2/src/util/array.h +18 -17
  298. data/vendor/libgit2/src/util/cc-compat.h +2 -0
  299. data/vendor/libgit2/src/util/ctype_compat.h +70 -0
  300. data/vendor/libgit2/src/util/date.c +22 -14
  301. data/vendor/libgit2/src/util/date.h +12 -0
  302. data/vendor/libgit2/src/util/errors.c +401 -0
  303. data/vendor/libgit2/src/{libgit2 → util}/errors.h +21 -17
  304. data/vendor/libgit2/src/util/fs_path.c +15 -4
  305. data/vendor/libgit2/src/util/fs_path.h +23 -0
  306. data/vendor/libgit2/src/util/futils.c +6 -5
  307. data/vendor/libgit2/src/util/futils.h +13 -4
  308. data/vendor/libgit2/src/util/git2_features.h.in +12 -1
  309. data/vendor/libgit2/src/util/git2_util.h +6 -0
  310. data/vendor/libgit2/src/util/hash/openssl.c +152 -0
  311. data/vendor/libgit2/src/util/hash/openssl.h +17 -1
  312. data/vendor/libgit2/src/util/hash/sha.h +4 -1
  313. data/vendor/libgit2/src/util/hashmap.h +424 -0
  314. data/vendor/libgit2/src/util/hashmap_str.h +43 -0
  315. data/vendor/libgit2/src/util/integer.h +3 -1
  316. data/vendor/libgit2/src/util/net.c +13 -7
  317. data/vendor/libgit2/src/util/net.h +2 -0
  318. data/vendor/libgit2/src/util/pool.c +1 -1
  319. data/vendor/libgit2/src/util/pool.h +5 -0
  320. data/vendor/libgit2/src/util/pqueue.h +1 -1
  321. data/vendor/libgit2/src/util/process.h +222 -0
  322. data/vendor/libgit2/src/util/rand.c +1 -7
  323. data/vendor/libgit2/src/util/regexp.c +1 -1
  324. data/vendor/libgit2/src/util/sortedcache.c +14 -13
  325. data/vendor/libgit2/src/util/sortedcache.h +3 -3
  326. data/vendor/libgit2/src/util/str.c +2 -2
  327. data/vendor/libgit2/src/util/strlist.c +108 -0
  328. data/vendor/libgit2/src/util/strlist.h +36 -0
  329. data/vendor/libgit2/src/util/unix/posix.h +0 -2
  330. data/vendor/libgit2/src/util/unix/process.c +629 -0
  331. data/vendor/libgit2/src/util/unix/realpath.c +23 -5
  332. data/vendor/libgit2/src/util/util.c +2 -2
  333. data/vendor/libgit2/src/util/util.h +4 -38
  334. data/vendor/libgit2/src/util/vector.c +3 -3
  335. data/vendor/libgit2/src/util/vector.h +2 -2
  336. data/vendor/libgit2/src/util/win32/posix_w32.c +29 -6
  337. data/vendor/libgit2/src/util/win32/process.c +506 -0
  338. metadata +45 -28
  339. data/vendor/libgit2/deps/http-parser/CMakeLists.txt +0 -6
  340. data/vendor/libgit2/deps/http-parser/COPYING +0 -23
  341. data/vendor/libgit2/deps/http-parser/http_parser.c +0 -2182
  342. data/vendor/libgit2/deps/http-parser/http_parser.h +0 -305
  343. data/vendor/libgit2/deps/zlib/COPYING +0 -27
  344. data/vendor/libgit2/include/git2/sys/reflog.h +0 -21
  345. data/vendor/libgit2/src/libgit2/config_entries.c +0 -237
  346. data/vendor/libgit2/src/libgit2/config_entries.h +0 -24
  347. data/vendor/libgit2/src/libgit2/errors.c +0 -293
  348. data/vendor/libgit2/src/libgit2/idxmap.c +0 -157
  349. data/vendor/libgit2/src/libgit2/idxmap.h +0 -177
  350. data/vendor/libgit2/src/libgit2/libgit2.h +0 -15
  351. data/vendor/libgit2/src/libgit2/offmap.c +0 -101
  352. data/vendor/libgit2/src/libgit2/offmap.h +0 -133
  353. data/vendor/libgit2/src/libgit2/oidmap.c +0 -107
  354. data/vendor/libgit2/src/libgit2/oidmap.h +0 -128
  355. data/vendor/libgit2/src/libgit2/threadstate.c +0 -97
  356. data/vendor/libgit2/src/libgit2/threadstate.h +0 -22
  357. data/vendor/libgit2/src/libgit2/transports/ssh.h +0 -14
  358. data/vendor/libgit2/src/util/khash.h +0 -615
  359. data/vendor/libgit2/src/util/strmap.c +0 -100
  360. data/vendor/libgit2/src/util/strmap.h +0 -131
  361. /data/vendor/libgit2/cmake/{FindHTTPParser.cmake → FindHTTP_Parser.cmake} +0 -0
@@ -1,30 +1,30 @@
1
- // The latest version of this library is available on GitHub;
2
- // https://github.com/sheredom/utf8.h
3
-
4
- // This is free and unencumbered software released into the public domain.
5
- //
6
- // Anyone is free to copy, modify, publish, use, compile, sell, or
7
- // distribute this software, either in source code form or as a compiled
8
- // binary, for any purpose, commercial or non-commercial, and by any
9
- // means.
10
- //
11
- // In jurisdictions that recognize copyright laws, the author or authors
12
- // of this software dedicate any and all copyright interest in the
13
- // software to the public domain. We make this dedication for the benefit
14
- // of the public at large and to the detriment of our heirs and
15
- // successors. We intend this dedication to be an overt act of
16
- // relinquishment in perpetuity of all present and future rights to this
17
- // software under copyright law.
18
- //
19
- // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
- // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
- // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22
- // IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23
- // OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24
- // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
- // OTHER DEALINGS IN THE SOFTWARE.
26
- //
27
- // For more information, please refer to <http://unlicense.org/>
1
+ /* The latest version of this library is available on GitHub;
2
+ * https://github.com/sheredom/utf8.h */
3
+
4
+ /* This is free and unencumbered software released into the public domain.
5
+ *
6
+ * Anyone is free to copy, modify, publish, use, compile, sell, or
7
+ * distribute this software, either in source code form or as a compiled
8
+ * binary, for any purpose, commercial or non-commercial, and by any
9
+ * means.
10
+ *
11
+ * In jurisdictions that recognize copyright laws, the author or authors
12
+ * of this software dedicate any and all copyright interest in the
13
+ * software to the public domain. We make this dedication for the benefit
14
+ * of the public at large and to the detriment of our heirs and
15
+ * successors. We intend this dedication to be an overt act of
16
+ * relinquishment in perpetuity of all present and future rights to this
17
+ * software under copyright law.
18
+ *
19
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25
+ * OTHER DEALINGS IN THE SOFTWARE.
26
+ *
27
+ * For more information, please refer to <http://unlicense.org/> */
28
28
 
29
29
  #ifndef SHEREDOM_UTF8_H_INCLUDED
30
30
  #define SHEREDOM_UTF8_H_INCLUDED
@@ -32,10 +32,25 @@
32
32
  #if defined(_MSC_VER)
33
33
  #pragma warning(push)
34
34
 
35
- // disable 'bytes padding added after construct' warning
35
+ /* disable warning: no function prototype given: converting '()' to '(void)' */
36
+ #pragma warning(disable : 4255)
37
+
38
+ /* disable warning: '__cplusplus' is not defined as a preprocessor macro,
39
+ * replacing with '0' for '#if/#elif' */
40
+ #pragma warning(disable : 4668)
41
+
42
+ /* disable warning: bytes padding added after construct */
36
43
  #pragma warning(disable : 4820)
37
44
  #endif
38
45
 
46
+ #if defined(__cplusplus)
47
+ #if defined(_MSC_VER)
48
+ #define utf8_cplusplus _MSVC_LANG
49
+ #else
50
+ #define utf8_cplusplus __cplusplus
51
+ #endif
52
+ #endif
53
+
39
54
  #include <stddef.h>
40
55
  #include <stdlib.h>
41
56
 
@@ -43,7 +58,7 @@
43
58
  #pragma warning(pop)
44
59
  #endif
45
60
 
46
- #if defined(_MSC_VER)
61
+ #if defined(_MSC_VER) && (_MSC_VER < 1920)
47
62
  typedef __int32 utf8_int32_t;
48
63
  #else
49
64
  #include <stdint.h>
@@ -54,411 +69,516 @@ typedef int32_t utf8_int32_t;
54
69
  #pragma clang diagnostic push
55
70
  #pragma clang diagnostic ignored "-Wold-style-cast"
56
71
  #pragma clang diagnostic ignored "-Wcast-qual"
72
+
73
+ #if __has_warning("-Wunsafe-buffer-usage")
74
+ #pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
75
+ #endif
57
76
  #endif
58
77
 
59
- #ifdef __cplusplus
78
+ #ifdef utf8_cplusplus
60
79
  extern "C" {
61
80
  #endif
62
81
 
63
- #if defined(__clang__) || defined(__GNUC__)
64
- #define utf8_nonnull __attribute__((nonnull))
65
- #define utf8_pure __attribute__((pure))
66
- #define utf8_restrict __restrict__
67
- #define utf8_weak __attribute__((weak))
68
- #elif defined(_MSC_VER)
82
+ #if defined(__TINYC__)
83
+ #define UTF8_ATTRIBUTE(a) __attribute((a))
84
+ #else
85
+ #define UTF8_ATTRIBUTE(a) __attribute__((a))
86
+ #endif
87
+
88
+ #if defined(_MSC_VER)
69
89
  #define utf8_nonnull
70
90
  #define utf8_pure
71
91
  #define utf8_restrict __restrict
72
92
  #define utf8_weak __inline
93
+ #elif defined(__clang__) || defined(__GNUC__)
94
+ #define utf8_nonnull UTF8_ATTRIBUTE(nonnull)
95
+ #define utf8_pure UTF8_ATTRIBUTE(pure)
96
+ #define utf8_restrict __restrict__
97
+ #define utf8_weak UTF8_ATTRIBUTE(weak)
98
+ #elif defined(__TINYC__)
99
+ #define utf8_nonnull UTF8_ATTRIBUTE(nonnull)
100
+ #define utf8_pure UTF8_ATTRIBUTE(pure)
101
+ #define utf8_restrict
102
+ #define utf8_weak UTF8_ATTRIBUTE(weak)
73
103
  #else
74
- #error Non clang, non gcc, non MSVC compiler found!
104
+ #error Non clang, non gcc, non MSVC, non tcc compiler found!
75
105
  #endif
76
106
 
77
- #ifdef __cplusplus
107
+ #ifdef utf8_cplusplus
78
108
  #define utf8_null NULL
79
109
  #else
80
110
  #define utf8_null 0
81
111
  #endif
82
112
 
83
- // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
84
- // src2 respectively, case insensitive.
85
- utf8_nonnull utf8_pure utf8_weak int utf8casecmp(const void *src1,
86
- const void *src2);
87
-
88
- // Append the utf8 string src onto the utf8 string dst.
89
- utf8_nonnull utf8_weak void *utf8cat(void *utf8_restrict dst,
90
- const void *utf8_restrict src);
91
-
92
- // Find the first match of the utf8 codepoint chr in the utf8 string src.
93
- utf8_nonnull utf8_pure utf8_weak void *utf8chr(const void *src,
94
- utf8_int32_t chr);
95
-
96
- // Return less than 0, 0, greater than 0 if src1 < src2,
97
- // src1 == src2, src1 > src2 respectively.
98
- utf8_nonnull utf8_pure utf8_weak int utf8cmp(const void *src1,
99
- const void *src2);
100
-
101
- // Copy the utf8 string src onto the memory allocated in dst.
102
- utf8_nonnull utf8_weak void *utf8cpy(void *utf8_restrict dst,
103
- const void *utf8_restrict src);
104
-
105
- // Number of utf8 codepoints in the utf8 string src that consists entirely
106
- // of utf8 codepoints not from the utf8 string reject.
107
- utf8_nonnull utf8_pure utf8_weak size_t utf8cspn(const void *src,
108
- const void *reject);
109
-
110
- // Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
111
- // copying over the data, and returning that. Or 0 if malloc failed.
112
- utf8_nonnull utf8_weak void *utf8dup(const void *src);
113
-
114
- // Number of utf8 codepoints in the utf8 string str,
115
- // excluding the null terminating byte.
116
- utf8_nonnull utf8_pure utf8_weak size_t utf8len(const void *str);
117
-
118
- // Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
119
- // src2 respectively, case insensitive. Checking at most n bytes of each utf8
120
- // string.
121
- utf8_nonnull utf8_pure utf8_weak int utf8ncasecmp(const void *src1,
122
- const void *src2, size_t n);
123
-
124
- // Append the utf8 string src onto the utf8 string dst,
125
- // writing at most n+1 bytes. Can produce an invalid utf8
126
- // string if n falls partway through a utf8 codepoint.
127
- utf8_nonnull utf8_weak void *utf8ncat(void *utf8_restrict dst,
128
- const void *utf8_restrict src, size_t n);
129
-
130
- // Return less than 0, 0, greater than 0 if src1 < src2,
131
- // src1 == src2, src1 > src2 respectively. Checking at most n
132
- // bytes of each utf8 string.
133
- utf8_nonnull utf8_pure utf8_weak int utf8ncmp(const void *src1,
134
- const void *src2, size_t n);
135
-
136
- // Copy the utf8 string src onto the memory allocated in dst.
137
- // Copies at most n bytes. If there is no terminating null byte in
138
- // the first n bytes of src, the string placed into dst will not be
139
- // null-terminated. If the size (in bytes) of src is less than n,
140
- // extra null terminating bytes are appended to dst such that at
141
- // total of n bytes are written. Can produce an invalid utf8
142
- // string if n falls partway through a utf8 codepoint.
143
- utf8_nonnull utf8_weak void *utf8ncpy(void *utf8_restrict dst,
144
- const void *utf8_restrict src, size_t n);
145
-
146
- // Similar to utf8dup, except that at most n bytes of src are copied. If src is
147
- // longer than n, only n bytes are copied and a null byte is added.
148
- //
149
- // Returns a new string if successful, 0 otherwise
150
- utf8_nonnull utf8_weak void *utf8ndup(const void *src, size_t n);
151
-
152
- // Locates the first occurence in the utf8 string str of any byte in the
153
- // utf8 string accept, or 0 if no match was found.
154
- utf8_nonnull utf8_pure utf8_weak void *utf8pbrk(const void *str,
155
- const void *accept);
156
-
157
- // Find the last match of the utf8 codepoint chr in the utf8 string src.
158
- utf8_nonnull utf8_pure utf8_weak void *utf8rchr(const void *src, int chr);
159
-
160
- // Number of bytes in the utf8 string str,
161
- // including the null terminating byte.
162
- utf8_nonnull utf8_pure utf8_weak size_t utf8size(const void *str);
163
-
164
- // Number of utf8 codepoints in the utf8 string src that consists entirely
165
- // of utf8 codepoints from the utf8 string accept.
166
- utf8_nonnull utf8_pure utf8_weak size_t utf8spn(const void *src,
167
- const void *accept);
168
-
169
- // The position of the utf8 string needle in the utf8 string haystack.
170
- utf8_nonnull utf8_pure utf8_weak void *utf8str(const void *haystack,
171
- const void *needle);
172
-
173
- // The position of the utf8 string needle in the utf8 string haystack, case
174
- // insensitive.
175
- utf8_nonnull utf8_pure utf8_weak void *utf8casestr(const void *haystack,
176
- const void *needle);
177
-
178
- // Return 0 on success, or the position of the invalid
179
- // utf8 codepoint on failure.
180
- utf8_nonnull utf8_pure utf8_weak void *utf8valid(const void *str);
181
-
182
- // Sets out_codepoint to the next utf8 codepoint in str, and returns the address
183
- // of the utf8 codepoint after the current one in str.
184
- utf8_nonnull utf8_weak void *
185
- utf8codepoint(const void *utf8_restrict str,
186
- utf8_int32_t *utf8_restrict out_codepoint);
187
-
188
- // Returns the size of the given codepoint in bytes.
189
- utf8_weak size_t utf8codepointsize(utf8_int32_t chr);
190
-
191
- // Write a codepoint to the given string, and return the address to the next
192
- // place after the written codepoint. Pass how many bytes left in the buffer to
193
- // n. If there is not enough space for the codepoint, this function returns
194
- // null.
195
- utf8_nonnull utf8_weak void *utf8catcodepoint(void *utf8_restrict str,
196
- utf8_int32_t chr, size_t n);
197
-
198
- // Returns 1 if the given character is lowercase, or 0 if it is not.
199
- utf8_weak int utf8islower(utf8_int32_t chr);
200
-
201
- // Returns 1 if the given character is uppercase, or 0 if it is not.
202
- utf8_weak int utf8isupper(utf8_int32_t chr);
203
-
204
- // Transform the given string into all lowercase codepoints.
205
- utf8_nonnull utf8_weak void utf8lwr(void *utf8_restrict str);
113
+ #if defined(utf8_cplusplus) && utf8_cplusplus >= 201402L && (!defined(_MSC_VER) || (defined(_MSC_VER) && _MSC_VER >= 1910))
114
+ #define utf8_constexpr14 constexpr
115
+ #define utf8_constexpr14_impl constexpr
116
+ #else
117
+ /* constexpr and weak are incompatible. so only enable one of them */
118
+ #define utf8_constexpr14 utf8_weak
119
+ #define utf8_constexpr14_impl
120
+ #endif
206
121
 
207
- // Transform the given string into all uppercase codepoints.
208
- utf8_nonnull utf8_weak void utf8upr(void *utf8_restrict str);
122
+ #if defined(utf8_cplusplus) && utf8_cplusplus >= 202002L
123
+ using utf8_int8_t = char8_t; /* Introduced in C++20 */
124
+ #else
125
+ typedef char utf8_int8_t;
126
+ #endif
209
127
 
210
- // Make a codepoint lower case if possible.
211
- utf8_weak utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
128
+ /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
129
+ * src2 respectively, case insensitive. */
130
+ utf8_constexpr14 utf8_nonnull utf8_pure int
131
+ utf8casecmp(const utf8_int8_t *src1, const utf8_int8_t *src2);
132
+
133
+ /* Append the utf8 string src onto the utf8 string dst. */
134
+ utf8_nonnull utf8_weak utf8_int8_t *
135
+ utf8cat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
136
+
137
+ /* Find the first match of the utf8 codepoint chr in the utf8 string src. */
138
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
139
+ utf8chr(const utf8_int8_t *src, utf8_int32_t chr);
140
+
141
+ /* Return less than 0, 0, greater than 0 if src1 < src2,
142
+ * src1 == src2, src1 > src2 respectively. */
143
+ utf8_constexpr14 utf8_nonnull utf8_pure int utf8cmp(const utf8_int8_t *src1,
144
+ const utf8_int8_t *src2);
145
+
146
+ /* Copy the utf8 string src onto the memory allocated in dst. */
147
+ utf8_nonnull utf8_weak utf8_int8_t *
148
+ utf8cpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src);
149
+
150
+ /* Number of utf8 codepoints in the utf8 string src that consists entirely
151
+ * of utf8 codepoints not from the utf8 string reject. */
152
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t
153
+ utf8cspn(const utf8_int8_t *src, const utf8_int8_t *reject);
154
+
155
+ /* Duplicate the utf8 string src by getting its size, malloc'ing a new buffer
156
+ * copying over the data, and returning that. Or 0 if malloc failed. */
157
+ utf8_weak utf8_int8_t *utf8dup(const utf8_int8_t *src);
158
+
159
+ /* Number of utf8 codepoints in the utf8 string str,
160
+ * excluding the null terminating byte. */
161
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8len(const utf8_int8_t *str);
162
+
163
+ /* Similar to utf8len, except that at most n bytes of str are examined. */
164
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8nlen(const utf8_int8_t *str,
165
+ size_t n);
166
+
167
+ /* Return less than 0, 0, greater than 0 if src1 < src2, src1 == src2, src1 >
168
+ * src2 respectively, case insensitive. Checking at most n bytes of each utf8
169
+ * string. */
170
+ utf8_constexpr14 utf8_nonnull utf8_pure int
171
+ utf8ncasecmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
172
+
173
+ /* Append the utf8 string src onto the utf8 string dst,
174
+ * writing at most n+1 bytes. Can produce an invalid utf8
175
+ * string if n falls partway through a utf8 codepoint. */
176
+ utf8_nonnull utf8_weak utf8_int8_t *
177
+ utf8ncat(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
178
+ size_t n);
179
+
180
+ /* Return less than 0, 0, greater than 0 if src1 < src2,
181
+ * src1 == src2, src1 > src2 respectively. Checking at most n
182
+ * bytes of each utf8 string. */
183
+ utf8_constexpr14 utf8_nonnull utf8_pure int
184
+ utf8ncmp(const utf8_int8_t *src1, const utf8_int8_t *src2, size_t n);
185
+
186
+ /* Copy the utf8 string src onto the memory allocated in dst.
187
+ * Copies at most n bytes. If n falls partway through a utf8
188
+ * codepoint, or if dst doesn't have enough room for a null
189
+ * terminator, the final string will be cut short to preserve
190
+ * utf8 validity. */
191
+
192
+ utf8_nonnull utf8_weak utf8_int8_t *
193
+ utf8ncpy(utf8_int8_t *utf8_restrict dst, const utf8_int8_t *utf8_restrict src,
194
+ size_t n);
195
+
196
+ /* Similar to utf8dup, except that at most n bytes of src are copied. If src is
197
+ * longer than n, only n bytes are copied and a null byte is added.
198
+ *
199
+ * Returns a new string if successful, 0 otherwise */
200
+ utf8_weak utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n);
201
+
202
+ /* Locates the first occurrence in the utf8 string str of any byte in the
203
+ * utf8 string accept, or 0 if no match was found. */
204
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
205
+ utf8pbrk(const utf8_int8_t *str, const utf8_int8_t *accept);
206
+
207
+ /* Find the last match of the utf8 codepoint chr in the utf8 string src. */
208
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
209
+ utf8rchr(const utf8_int8_t *src, int chr);
210
+
211
+ /* Number of bytes in the utf8 string str,
212
+ * including the null terminating byte. */
213
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t utf8size(const utf8_int8_t *str);
214
+
215
+ /* Similar to utf8size, except that the null terminating byte is excluded. */
216
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t
217
+ utf8size_lazy(const utf8_int8_t *str);
218
+
219
+ /* Similar to utf8size, except that at most n bytes of str are examined and
220
+ * the null terminating byte is excluded. */
221
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t
222
+ utf8nsize_lazy(const utf8_int8_t *str, size_t n);
223
+
224
+ /* Number of utf8 codepoints in the utf8 string src that consists entirely
225
+ * of utf8 codepoints from the utf8 string accept. */
226
+ utf8_constexpr14 utf8_nonnull utf8_pure size_t
227
+ utf8spn(const utf8_int8_t *src, const utf8_int8_t *accept);
228
+
229
+ /* The position of the utf8 string needle in the utf8 string haystack. */
230
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
231
+ utf8str(const utf8_int8_t *haystack, const utf8_int8_t *needle);
232
+
233
+ /* The position of the utf8 string needle in the utf8 string haystack, case
234
+ * insensitive. */
235
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
236
+ utf8casestr(const utf8_int8_t *haystack, const utf8_int8_t *needle);
237
+
238
+ /* Return 0 on success, or the position of the invalid
239
+ * utf8 codepoint on failure. */
240
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
241
+ utf8valid(const utf8_int8_t *str);
242
+
243
+ /* Similar to utf8valid, except that at most n bytes of str are examined. */
244
+ utf8_constexpr14 utf8_nonnull utf8_pure utf8_int8_t *
245
+ utf8nvalid(const utf8_int8_t *str, size_t n);
246
+
247
+ /* Given a null-terminated string, makes the string valid by replacing invalid
248
+ * codepoints with a 1-byte replacement. Returns 0 on success. */
249
+ utf8_nonnull utf8_weak int utf8makevalid(utf8_int8_t *str,
250
+ const utf8_int32_t replacement);
251
+
252
+ /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
253
+ * address of the next utf8 codepoint after the current one in str. */
254
+ utf8_constexpr14 utf8_nonnull utf8_int8_t *
255
+ utf8codepoint(const utf8_int8_t *utf8_restrict str,
256
+ utf8_int32_t *utf8_restrict out_codepoint);
212
257
 
213
- // Make a codepoint upper case if possible.
214
- utf8_weak utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
258
+ /* Calculates the size of the next utf8 codepoint in str. */
259
+ utf8_constexpr14 utf8_nonnull size_t
260
+ utf8codepointcalcsize(const utf8_int8_t *str);
261
+
262
+ /* Returns the size of the given codepoint in bytes. */
263
+ utf8_constexpr14 size_t utf8codepointsize(utf8_int32_t chr);
264
+
265
+ /* Write a codepoint to the given string, and return the address to the next
266
+ * place after the written codepoint. Pass how many bytes left in the buffer to
267
+ * n. If there is not enough space for the codepoint, this function returns
268
+ * null. */
269
+ utf8_nonnull utf8_weak utf8_int8_t *
270
+ utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n);
271
+
272
+ /* Returns 1 if the given character is lowercase, or 0 if it is not. */
273
+ utf8_constexpr14 int utf8islower(utf8_int32_t chr);
274
+
275
+ /* Returns 1 if the given character is uppercase, or 0 if it is not. */
276
+ utf8_constexpr14 int utf8isupper(utf8_int32_t chr);
277
+
278
+ /* Transform the given string into all lowercase codepoints. */
279
+ utf8_nonnull utf8_weak void utf8lwr(utf8_int8_t *utf8_restrict str);
280
+
281
+ /* Transform the given string into all uppercase codepoints. */
282
+ utf8_nonnull utf8_weak void utf8upr(utf8_int8_t *utf8_restrict str);
283
+
284
+ /* Make a codepoint lower case if possible. */
285
+ utf8_constexpr14 utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp);
286
+
287
+ /* Make a codepoint upper case if possible. */
288
+ utf8_constexpr14 utf8_int32_t utf8uprcodepoint(utf8_int32_t cp);
289
+
290
+ /* Sets out_codepoint to the current utf8 codepoint in str, and returns the
291
+ * address of the previous utf8 codepoint before the current one in str. */
292
+ utf8_constexpr14 utf8_nonnull utf8_int8_t *
293
+ utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
294
+ utf8_int32_t *utf8_restrict out_codepoint);
295
+
296
+ /* Duplicate the utf8 string src by getting its size, calling alloc_func_ptr to
297
+ * copy over data to a new buffer, and returning that. Or 0 if alloc_func_ptr
298
+ * returned null. */
299
+ utf8_weak utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
300
+ utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
301
+ size_t),
302
+ utf8_int8_t *user_data);
303
+
304
+ /* Similar to utf8dup_ex, except that at most n bytes of src are copied. If src is
305
+ * longer than n, only n bytes are copied and a null byte is added.
306
+ *
307
+ * Returns a new string if successful, 0 otherwise. */
308
+ utf8_weak utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
309
+ utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *,
310
+ size_t),
311
+ utf8_int8_t *user_data);
215
312
 
216
313
  #undef utf8_weak
217
314
  #undef utf8_pure
218
315
  #undef utf8_nonnull
219
316
 
220
- int utf8casecmp(const void *src1, const void *src2) {
221
- utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
317
+ utf8_constexpr14_impl int utf8casecmp(const utf8_int8_t *src1,
318
+ const utf8_int8_t *src2) {
319
+ utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
320
+ src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
222
321
 
223
322
  for (;;) {
224
- src1 = utf8codepoint(src1, &src1_cp);
225
- src2 = utf8codepoint(src2, &src2_cp);
323
+ src1 = utf8codepoint(src1, &src1_orig_cp);
324
+ src2 = utf8codepoint(src2, &src2_orig_cp);
226
325
 
227
- // Take a copy of src1 & src2
228
- src1_orig_cp = src1_cp;
229
- src2_orig_cp = src2_cp;
326
+ /* lower the srcs if required */
327
+ src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
328
+ src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
230
329
 
231
- // Lower the srcs if required
232
- src1_cp = utf8lwrcodepoint(src1_cp);
233
- src2_cp = utf8lwrcodepoint(src2_cp);
330
+ /* upper the srcs if required */
331
+ src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
332
+ src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
234
333
 
235
- // Check if the lowered codepoints match
334
+ /* check if the lowered or uppered codepoints match */
236
335
  if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
237
336
  return 0;
238
- } else if (src1_cp == src2_cp) {
337
+ } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
239
338
  continue;
240
339
  }
241
340
 
242
- // If they don't match, then we return which of the original's are less
243
- if (src1_orig_cp < src2_orig_cp) {
244
- return -1;
245
- } else if (src1_orig_cp > src2_orig_cp) {
246
- return 1;
247
- }
341
+ /* if they don't match, then we return the difference between the characters
342
+ */
343
+ return src1_lwr_cp - src2_lwr_cp;
248
344
  }
249
345
  }
250
346
 
251
- void *utf8cat(void *utf8_restrict dst, const void *utf8_restrict src) {
252
- char *d = (char *)dst;
253
- const char *s = (const char *)src;
254
-
255
- // find the null terminating byte in dst
347
+ utf8_int8_t *utf8cat(utf8_int8_t *utf8_restrict dst,
348
+ const utf8_int8_t *utf8_restrict src) {
349
+ utf8_int8_t *d = dst;
350
+ /* find the null terminating byte in dst */
256
351
  while ('\0' != *d) {
257
352
  d++;
258
353
  }
259
354
 
260
- // overwriting the null terminating byte in dst, append src byte-by-byte
261
- while ('\0' != *s) {
262
- *d++ = *s++;
355
+ /* overwriting the null terminating byte in dst, append src byte-by-byte */
356
+ while ('\0' != *src) {
357
+ *d++ = *src++;
263
358
  }
264
359
 
265
- // write out a new null terminating byte into dst
360
+ /* write out a new null terminating byte into dst */
266
361
  *d = '\0';
267
362
 
268
363
  return dst;
269
364
  }
270
365
 
271
- void *utf8chr(const void *src, utf8_int32_t chr) {
272
- char c[5] = {'\0', '\0', '\0', '\0', '\0'};
366
+ utf8_constexpr14_impl utf8_int8_t *utf8chr(const utf8_int8_t *src,
367
+ utf8_int32_t chr) {
368
+ utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
273
369
 
274
370
  if (0 == chr) {
275
- // being asked to return position of null terminating byte, so
276
- // just run s to the end, and return!
277
- const char *s = (const char *)src;
278
- while ('\0' != *s) {
279
- s++;
371
+ /* being asked to return position of null terminating byte, so
372
+ * just run s to the end, and return! */
373
+ while ('\0' != *src) {
374
+ src++;
280
375
  }
281
- return (void *)s;
376
+ return (utf8_int8_t *)src;
282
377
  } else if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
283
- // 1-byte/7-bit ascii
284
- // (0b0xxxxxxx)
285
- c[0] = (char)chr;
378
+ /* 1-byte/7-bit ascii
379
+ * (0b0xxxxxxx) */
380
+ c[0] = (utf8_int8_t)chr;
286
381
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
287
- // 2-byte/11-bit utf8 code point
288
- // (0b110xxxxx 0b10xxxxxx)
289
- c[0] = 0xc0 | (char)(chr >> 6);
290
- c[1] = 0x80 | (char)(chr & 0x3f);
382
+ /* 2-byte/11-bit utf8 code point
383
+ * (0b110xxxxx 0b10xxxxxx) */
384
+ c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
385
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
291
386
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
292
- // 3-byte/16-bit utf8 code point
293
- // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
294
- c[0] = 0xe0 | (char)(chr >> 12);
295
- c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
296
- c[2] = 0x80 | (char)(chr & 0x3f);
297
- } else { // if (0 == ((int)0xffe00000 & chr)) {
298
- // 4-byte/21-bit utf8 code point
299
- // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
300
- c[0] = 0xf0 | (char)(chr >> 18);
301
- c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
302
- c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
303
- c[3] = 0x80 | (char)(chr & 0x3f);
387
+ /* 3-byte/16-bit utf8 code point
388
+ * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
389
+ c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
390
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
391
+ c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
392
+ } else { /* if (0 == ((int)0xffe00000 & chr)) { */
393
+ /* 4-byte/21-bit utf8 code point
394
+ * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
395
+ c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
396
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
397
+ c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
398
+ c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
304
399
  }
305
400
 
306
- // we've made c into a 2 utf8 codepoint string, one for the chr we are
307
- // seeking, another for the null terminating byte. Now use utf8str to
308
- // search
401
+ /* we've made c into a 2 utf8 codepoint string, one for the chr we are
402
+ * seeking, another for the null terminating byte. Now use utf8str to
403
+ * search */
309
404
  return utf8str(src, c);
310
405
  }
311
406
 
312
- int utf8cmp(const void *src1, const void *src2) {
313
- const unsigned char *s1 = (const unsigned char *)src1;
314
- const unsigned char *s2 = (const unsigned char *)src2;
315
-
316
- while (('\0' != *s1) || ('\0' != *s2)) {
317
- if (*s1 < *s2) {
407
+ utf8_constexpr14_impl int utf8cmp(const utf8_int8_t *src1,
408
+ const utf8_int8_t *src2) {
409
+ while (('\0' != *src1) || ('\0' != *src2)) {
410
+ if (*src1 < *src2) {
318
411
  return -1;
319
- } else if (*s1 > *s2) {
412
+ } else if (*src1 > *src2) {
320
413
  return 1;
321
414
  }
322
415
 
323
- s1++;
324
- s2++;
416
+ src1++;
417
+ src2++;
325
418
  }
326
419
 
327
- // both utf8 strings matched
420
+ /* both utf8 strings matched */
328
421
  return 0;
329
422
  }
330
423
 
331
- int utf8coll(const void *src1, const void *src2);
424
+ utf8_constexpr14_impl int utf8coll(const utf8_int8_t *src1,
425
+ const utf8_int8_t *src2);
332
426
 
333
- void *utf8cpy(void *utf8_restrict dst, const void *utf8_restrict src) {
334
- char *d = (char *)dst;
335
- const char *s = (const char *)src;
427
+ utf8_int8_t *utf8cpy(utf8_int8_t *utf8_restrict dst,
428
+ const utf8_int8_t *utf8_restrict src) {
429
+ utf8_int8_t *d = dst;
336
430
 
337
- // overwriting anything previously in dst, write byte-by-byte
338
- // from src
339
- while ('\0' != *s) {
340
- *d++ = *s++;
431
+ /* overwriting anything previously in dst, write byte-by-byte
432
+ * from src */
433
+ while ('\0' != *src) {
434
+ *d++ = *src++;
341
435
  }
342
436
 
343
- // append null terminating byte
437
+ /* append null terminating byte */
344
438
  *d = '\0';
345
439
 
346
440
  return dst;
347
441
  }
348
442
 
349
- size_t utf8cspn(const void *src, const void *reject) {
350
- const char *s = (const char *)src;
443
+ utf8_constexpr14_impl size_t utf8cspn(const utf8_int8_t *src,
444
+ const utf8_int8_t *reject) {
351
445
  size_t chars = 0;
352
446
 
353
- while ('\0' != *s) {
354
- const char *r = (const char *)reject;
447
+ while ('\0' != *src) {
448
+ const utf8_int8_t *r = reject;
355
449
  size_t offset = 0;
356
450
 
357
451
  while ('\0' != *r) {
358
- // checking that if *r is the start of a utf8 codepoint
359
- // (it is not 0b10xxxxxx) and we have successfully matched
360
- // a previous character (0 < offset) - we found a match
452
+ /* checking that if *r is the start of a utf8 codepoint
453
+ * (it is not 0b10xxxxxx) and we have successfully matched
454
+ * a previous character (0 < offset) - we found a match */
361
455
  if ((0x80 != (0xc0 & *r)) && (0 < offset)) {
362
456
  return chars;
363
457
  } else {
364
- if (*r == s[offset]) {
365
- // part of a utf8 codepoint matched, so move our checking
366
- // onwards to the next byte
458
+ if (*r == src[offset]) {
459
+ /* part of a utf8 codepoint matched, so move our checking
460
+ * onwards to the next byte */
367
461
  offset++;
368
462
  r++;
369
463
  } else {
370
- // r could be in the middle of an unmatching utf8 code point,
371
- // so we need to march it on to the next character beginning,
464
+ /* r could be in the middle of an unmatching utf8 code point,
465
+ * so we need to march it on to the next character beginning, */
372
466
 
373
467
  do {
374
468
  r++;
375
469
  } while (0x80 == (0xc0 & *r));
376
470
 
377
- // reset offset too as we found a mismatch
471
+ /* reset offset too as we found a mismatch */
378
472
  offset = 0;
379
473
  }
380
474
  }
381
475
  }
382
476
 
383
- // the current utf8 codepoint in src did not match reject, but src
384
- // could have been partway through a utf8 codepoint, so we need to
385
- // march it onto the next utf8 codepoint starting byte
477
+ /* found a match at the end of *r, so didn't get a chance to test it */
478
+ if (0 < offset) {
479
+ return chars;
480
+ }
481
+
482
+ /* the current utf8 codepoint in src did not match reject, but src
483
+ * could have been partway through a utf8 codepoint, so we need to
484
+ * march it onto the next utf8 codepoint starting byte */
386
485
  do {
387
- s++;
388
- } while ((0x80 == (0xc0 & *s)));
486
+ src++;
487
+ } while ((0x80 == (0xc0 & *src)));
389
488
  chars++;
390
489
  }
391
490
 
392
491
  return chars;
393
492
  }
394
493
 
395
- size_t utf8size(const void *str);
494
+ utf8_int8_t *utf8dup(const utf8_int8_t *src) {
495
+ return utf8dup_ex(src, utf8_null, utf8_null);
496
+ }
396
497
 
397
- void *utf8dup(const void *src) {
398
- const char *s = (const char *)src;
399
- char *n = utf8_null;
498
+ utf8_int8_t *utf8dup_ex(const utf8_int8_t *src,
499
+ utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
500
+ utf8_int8_t *user_data) {
501
+ utf8_int8_t *n = utf8_null;
400
502
 
401
- // figure out how many bytes (including the terminator) we need to copy first
503
+ /* figure out how many bytes (including the terminator) we need to copy first
504
+ */
402
505
  size_t bytes = utf8size(src);
403
506
 
404
- n = (char *)malloc(bytes);
507
+ if (alloc_func_ptr) {
508
+ n = alloc_func_ptr(user_data, bytes);
509
+ } else {
510
+ #if !defined(UTF8_NO_STD_MALLOC)
511
+ n = (utf8_int8_t *)malloc(bytes);
512
+ #else
513
+ return utf8_null;
514
+ #endif
515
+ }
405
516
 
406
517
  if (utf8_null == n) {
407
- // out of memory so we bail
518
+ /* out of memory so we bail */
408
519
  return utf8_null;
409
520
  } else {
410
521
  bytes = 0;
411
522
 
412
- // copy src byte-by-byte into our new utf8 string
413
- while ('\0' != s[bytes]) {
414
- n[bytes] = s[bytes];
523
+ /* copy src byte-by-byte into our new utf8 string */
524
+ while ('\0' != src[bytes]) {
525
+ n[bytes] = src[bytes];
415
526
  bytes++;
416
527
  }
417
528
 
418
- // append null terminating byte
529
+ /* append null terminating byte */
419
530
  n[bytes] = '\0';
420
531
  return n;
421
532
  }
422
533
  }
423
534
 
424
- void *utf8fry(const void *str);
535
+ utf8_constexpr14_impl utf8_int8_t *utf8fry(const utf8_int8_t *str);
536
+
537
+ utf8_constexpr14_impl size_t utf8len(const utf8_int8_t *str) {
538
+ return utf8nlen(str, SIZE_MAX);
539
+ }
425
540
 
426
- size_t utf8len(const void *str) {
427
- const unsigned char *s = (const unsigned char *)str;
541
+ utf8_constexpr14_impl size_t utf8nlen(const utf8_int8_t *str, size_t n) {
542
+ const utf8_int8_t *t = str;
428
543
  size_t length = 0;
429
544
 
430
- while ('\0' != *s) {
431
- if (0xf0 == (0xf8 & *s)) {
432
- // 4-byte utf8 code point (began with 0b11110xxx)
433
- s += 4;
434
- } else if (0xe0 == (0xf0 & *s)) {
435
- // 3-byte utf8 code point (began with 0b1110xxxx)
436
- s += 3;
437
- } else if (0xc0 == (0xe0 & *s)) {
438
- // 2-byte utf8 code point (began with 0b110xxxxx)
439
- s += 2;
440
- } else { // if (0x00 == (0x80 & *s)) {
441
- // 1-byte ascii (began with 0b0xxxxxxx)
442
- s += 1;
545
+ while ((size_t)(str - t) < n && '\0' != *str) {
546
+ if (0xf0 == (0xf8 & *str)) {
547
+ /* 4-byte utf8 code point (began with 0b11110xxx) */
548
+ str += 4;
549
+ } else if (0xe0 == (0xf0 & *str)) {
550
+ /* 3-byte utf8 code point (began with 0b1110xxxx) */
551
+ str += 3;
552
+ } else if (0xc0 == (0xe0 & *str)) {
553
+ /* 2-byte utf8 code point (began with 0b110xxxxx) */
554
+ str += 2;
555
+ } else { /* if (0x00 == (0x80 & *s)) { */
556
+ /* 1-byte ascii (began with 0b0xxxxxxx) */
557
+ str += 1;
443
558
  }
444
559
 
445
- // no matter the bytes we marched s forward by, it was
446
- // only 1 utf8 codepoint
560
+ /* no matter the bytes we marched str forward by, it was
561
+ * only 1 utf8 codepoint */
447
562
  length++;
448
563
  }
449
564
 
565
+ if ((size_t)(str - t) > n) {
566
+ length--;
567
+ }
450
568
  return length;
451
569
  }
452
570
 
453
- int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
454
- utf8_int32_t src1_cp, src2_cp, src1_orig_cp, src2_orig_cp;
571
+ utf8_constexpr14_impl int utf8ncasecmp(const utf8_int8_t *src1,
572
+ const utf8_int8_t *src2, size_t n) {
573
+ utf8_int32_t src1_lwr_cp = 0, src2_lwr_cp = 0, src1_upr_cp = 0,
574
+ src2_upr_cp = 0, src1_orig_cp = 0, src2_orig_cp = 0;
455
575
 
456
576
  do {
457
- const unsigned char *const s1 = (const unsigned char *)src1;
458
- const unsigned char *const s2 = (const unsigned char *)src2;
577
+ const utf8_int8_t *const s1 = src1;
578
+ const utf8_int8_t *const s2 = src2;
459
579
 
460
- // first check that we have enough bytes left in n to contain an entire
461
- // codepoint
580
+ /* first check that we have enough bytes left in n to contain an entire
581
+ * codepoint */
462
582
  if (0 == n) {
463
583
  return 0;
464
584
  }
@@ -467,10 +587,8 @@ int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
467
587
  const utf8_int32_t c1 = (0xe0 & *s1);
468
588
  const utf8_int32_t c2 = (0xe0 & *s2);
469
589
 
470
- if (c1 < c2) {
471
- return -1;
472
- } else if (c1 > c2) {
473
- return 1;
590
+ if (c1 != c2) {
591
+ return c1 - c2;
474
592
  } else {
475
593
  return 0;
476
594
  }
@@ -480,10 +598,8 @@ int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
480
598
  const utf8_int32_t c1 = (0xf0 & *s1);
481
599
  const utf8_int32_t c2 = (0xf0 & *s2);
482
600
 
483
- if (c1 < c2) {
484
- return -1;
485
- } else if (c1 > c2) {
486
- return 1;
601
+ if (c1 != c2) {
602
+ return c1 - c2;
487
603
  } else {
488
604
  return 0;
489
605
  }
@@ -493,307 +609,343 @@ int utf8ncasecmp(const void *src1, const void *src2, size_t n) {
493
609
  const utf8_int32_t c1 = (0xf8 & *s1);
494
610
  const utf8_int32_t c2 = (0xf8 & *s2);
495
611
 
496
- if (c1 < c2) {
497
- return -1;
498
- } else if (c1 > c2) {
499
- return 1;
612
+ if (c1 != c2) {
613
+ return c1 - c2;
500
614
  } else {
501
615
  return 0;
502
616
  }
503
617
  }
504
618
 
505
- src1 = utf8codepoint(src1, &src1_cp);
506
- src2 = utf8codepoint(src2, &src2_cp);
507
- n -= utf8codepointsize(src1_cp);
619
+ src1 = utf8codepoint(src1, &src1_orig_cp);
620
+ src2 = utf8codepoint(src2, &src2_orig_cp);
621
+ n -= utf8codepointsize(src1_orig_cp);
508
622
 
509
- // Take a copy of src1 & src2
510
- src1_orig_cp = src1_cp;
511
- src2_orig_cp = src2_cp;
623
+ src1_lwr_cp = utf8lwrcodepoint(src1_orig_cp);
624
+ src2_lwr_cp = utf8lwrcodepoint(src2_orig_cp);
512
625
 
513
- // Lower srcs if required
514
- src1_cp = utf8lwrcodepoint(src1_cp);
515
- src2_cp = utf8lwrcodepoint(src2_cp);
626
+ src1_upr_cp = utf8uprcodepoint(src1_orig_cp);
627
+ src2_upr_cp = utf8uprcodepoint(src2_orig_cp);
516
628
 
517
- // Check if the lowered codepoints match
629
+ /* check if the lowered codepoints match */
518
630
  if ((0 == src1_orig_cp) && (0 == src2_orig_cp)) {
519
631
  return 0;
520
- } else if (src1_cp == src2_cp) {
632
+ } else if ((src1_lwr_cp == src2_lwr_cp) || (src1_upr_cp == src2_upr_cp)) {
521
633
  continue;
522
634
  }
523
635
 
524
- // If they don't match, then we return which of the original's are less
525
- if (src1_orig_cp < src2_orig_cp) {
526
- return -1;
527
- } else if (src1_orig_cp > src2_orig_cp) {
528
- return 1;
529
- }
636
+ /* if they don't match, then we return the difference between the characters
637
+ */
638
+ return src1_lwr_cp - src2_lwr_cp;
530
639
  } while (0 < n);
531
640
 
532
- // both utf8 strings matched
641
+ /* both utf8 strings matched */
533
642
  return 0;
534
643
  }
535
644
 
536
- void *utf8ncat(void *utf8_restrict dst, const void *utf8_restrict src,
537
- size_t n) {
538
- char *d = (char *)dst;
539
- const char *s = (const char *)src;
645
+ utf8_int8_t *utf8ncat(utf8_int8_t *utf8_restrict dst,
646
+ const utf8_int8_t *utf8_restrict src, size_t n) {
647
+ utf8_int8_t *d = dst;
540
648
 
541
- // find the null terminating byte in dst
649
+ /* find the null terminating byte in dst */
542
650
  while ('\0' != *d) {
543
651
  d++;
544
652
  }
545
653
 
546
- // overwriting the null terminating byte in dst, append src byte-by-byte
547
- // stopping if we run out of space
548
- do {
549
- *d++ = *s++;
550
- } while (('\0' != *s) && (0 != --n));
654
+ /* overwriting the null terminating byte in dst, append src byte-by-byte
655
+ * stopping if we run out of space */
656
+ while (('\0' != *src) && (0 != n--)) {
657
+ *d++ = *src++;
658
+ }
551
659
 
552
- // write out a new null terminating byte into dst
660
+ /* write out a new null terminating byte into dst */
553
661
  *d = '\0';
554
662
 
555
663
  return dst;
556
664
  }
557
665
 
558
- int utf8ncmp(const void *src1, const void *src2, size_t n) {
559
- const unsigned char *s1 = (const unsigned char *)src1;
560
- const unsigned char *s2 = (const unsigned char *)src2;
561
-
562
- while ((('\0' != *s1) || ('\0' != *s2)) && (0 != n--)) {
563
- if (*s1 < *s2) {
666
+ utf8_constexpr14_impl int utf8ncmp(const utf8_int8_t *src1,
667
+ const utf8_int8_t *src2, size_t n) {
668
+ while ((0 != n--) && (('\0' != *src1) || ('\0' != *src2))) {
669
+ if (*src1 < *src2) {
564
670
  return -1;
565
- } else if (*s1 > *s2) {
671
+ } else if (*src1 > *src2) {
566
672
  return 1;
567
673
  }
568
674
 
569
- s1++;
570
- s2++;
675
+ src1++;
676
+ src2++;
571
677
  }
572
678
 
573
- // both utf8 strings matched
679
+ /* both utf8 strings matched */
574
680
  return 0;
575
681
  }
576
682
 
577
- void *utf8ncpy(void *utf8_restrict dst, const void *utf8_restrict src,
578
- size_t n) {
579
- char *d = (char *)dst;
580
- const char *s = (const char *)src;
683
+ utf8_int8_t *utf8ncpy(utf8_int8_t *utf8_restrict dst,
684
+ const utf8_int8_t *utf8_restrict src, size_t n) {
685
+ utf8_int8_t *d = dst;
686
+ size_t index = 0, check_index = 0;
581
687
 
582
- // overwriting anything previously in dst, write byte-by-byte
583
- // from src
584
- do {
585
- *d++ = *s++;
586
- } while (('\0' != *s) && (0 != --n));
688
+ if (n == 0) {
689
+ return dst;
690
+ }
587
691
 
588
- // append null terminating byte
589
- while (0 != n) {
590
- *d++ = '\0';
591
- n--;
692
+ /* overwriting anything previously in dst, write byte-by-byte
693
+ * from src */
694
+ for (index = 0; index < n; index++) {
695
+ d[index] = src[index];
696
+ if ('\0' == src[index]) {
697
+ break;
698
+ }
699
+ }
700
+
701
+ for (check_index = index - 1;
702
+ check_index > 0 && 0x80 == (0xc0 & d[check_index]); check_index--) {
703
+ /* just moving the index */
704
+ }
705
+
706
+ if (check_index < index &&
707
+ ((index - check_index) < utf8codepointcalcsize(&d[check_index]) ||
708
+ (index - check_index) == n)) {
709
+ index = check_index;
710
+ }
711
+
712
+ /* append null terminating byte */
713
+ for (; index < n; index++) {
714
+ d[index] = 0;
592
715
  }
593
716
 
594
717
  return dst;
595
718
  }
596
719
 
597
- void *utf8ndup(const void *src, size_t n) {
598
- const char *s = (const char *)src;
599
- char *c = utf8_null;
720
+ utf8_int8_t *utf8ndup(const utf8_int8_t *src, size_t n) {
721
+ return utf8ndup_ex(src, n, utf8_null, utf8_null);
722
+ }
723
+
724
+ utf8_int8_t *utf8ndup_ex(const utf8_int8_t *src, size_t n,
725
+ utf8_int8_t *(*alloc_func_ptr)(utf8_int8_t *, size_t),
726
+ utf8_int8_t *user_data) {
727
+ utf8_int8_t *c = utf8_null;
600
728
  size_t bytes = 0;
601
729
 
602
- // Find the end of the string or stop when n is reached
603
- while ('\0' != s[bytes] && bytes < n) {
730
+ /* Find the end of the string or stop when n is reached */
731
+ while ('\0' != src[bytes] && bytes < n) {
604
732
  bytes++;
605
733
  }
606
734
 
607
- // In case bytes is actually less than n, we need to set it
608
- // to be used later in the copy byte by byte.
735
+ /* In case bytes is actually less than n, we need to set it
736
+ * to be used later in the copy byte by byte. */
609
737
  n = bytes;
610
738
 
611
- c = (char *)malloc(bytes + 1);
739
+ if (alloc_func_ptr) {
740
+ c = alloc_func_ptr(user_data, bytes + 1);
741
+ } else {
742
+ #if !defined(UTF8_NO_STD_MALLOC)
743
+ c = (utf8_int8_t *)malloc(bytes + 1);
744
+ #else
745
+ c = utf8_null;
746
+ #endif
747
+ }
748
+
612
749
  if (utf8_null == c) {
613
- // out of memory so we bail
750
+ /* out of memory so we bail */
614
751
  return utf8_null;
615
752
  }
616
753
 
617
754
  bytes = 0;
618
755
 
619
- // copy src byte-by-byte into our new utf8 string
620
- while ('\0' != s[bytes] && bytes < n) {
621
- c[bytes] = s[bytes];
756
+ /* copy src byte-by-byte into our new utf8 string */
757
+ while ('\0' != src[bytes] && bytes < n) {
758
+ c[bytes] = src[bytes];
622
759
  bytes++;
623
760
  }
624
761
 
625
- // append null terminating byte
762
+ /* append null terminating byte */
626
763
  c[bytes] = '\0';
627
764
  return c;
628
765
  }
629
766
 
630
- void *utf8rchr(const void *src, int chr) {
631
- const char *s = (const char *)src;
632
- const char *match = utf8_null;
633
- char c[5] = {'\0', '\0', '\0', '\0', '\0'};
767
+ utf8_constexpr14_impl utf8_int8_t *utf8rchr(const utf8_int8_t *src, int chr) {
768
+
769
+ utf8_int8_t *match = utf8_null;
770
+ utf8_int8_t c[5] = {'\0', '\0', '\0', '\0', '\0'};
634
771
 
635
772
  if (0 == chr) {
636
- // being asked to return position of null terminating byte, so
637
- // just run s to the end, and return!
638
- while ('\0' != *s) {
639
- s++;
773
+ /* being asked to return position of null terminating byte, so
774
+ * just run s to the end, and return! */
775
+ while ('\0' != *src) {
776
+ src++;
640
777
  }
641
- return (void *)s;
778
+ return (utf8_int8_t *)src;
642
779
  } else if (0 == ((int)0xffffff80 & chr)) {
643
- // 1-byte/7-bit ascii
644
- // (0b0xxxxxxx)
645
- c[0] = (char)chr;
780
+ /* 1-byte/7-bit ascii
781
+ * (0b0xxxxxxx) */
782
+ c[0] = (utf8_int8_t)chr;
646
783
  } else if (0 == ((int)0xfffff800 & chr)) {
647
- // 2-byte/11-bit utf8 code point
648
- // (0b110xxxxx 0b10xxxxxx)
649
- c[0] = 0xc0 | (char)(chr >> 6);
650
- c[1] = 0x80 | (char)(chr & 0x3f);
784
+ /* 2-byte/11-bit utf8 code point
785
+ * (0b110xxxxx 0b10xxxxxx) */
786
+ c[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)(chr >> 6));
787
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
651
788
  } else if (0 == ((int)0xffff0000 & chr)) {
652
- // 3-byte/16-bit utf8 code point
653
- // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
654
- c[0] = 0xe0 | (char)(chr >> 12);
655
- c[1] = 0x80 | (char)((chr >> 6) & 0x3f);
656
- c[2] = 0x80 | (char)(chr & 0x3f);
657
- } else { // if (0 == ((int)0xffe00000 & chr)) {
658
- // 4-byte/21-bit utf8 code point
659
- // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
660
- c[0] = 0xf0 | (char)(chr >> 18);
661
- c[1] = 0x80 | (char)((chr >> 12) & 0x3f);
662
- c[2] = 0x80 | (char)((chr >> 6) & 0x3f);
663
- c[3] = 0x80 | (char)(chr & 0x3f);
789
+ /* 3-byte/16-bit utf8 code point
790
+ * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
791
+ c[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)(chr >> 12));
792
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
793
+ c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
794
+ } else { /* if (0 == ((int)0xffe00000 & chr)) { */
795
+ /* 4-byte/21-bit utf8 code point
796
+ * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
797
+ c[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)(chr >> 18));
798
+ c[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
799
+ c[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
800
+ c[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
664
801
  }
665
802
 
666
- // we've created a 2 utf8 codepoint string in c that is
667
- // the utf8 character asked for by chr, and a null
668
- // terminating byte
803
+ /* we've created a 2 utf8 codepoint string in c that is
804
+ * the utf8 character asked for by chr, and a null
805
+ * terminating byte */
669
806
 
670
- while ('\0' != *s) {
807
+ while ('\0' != *src) {
671
808
  size_t offset = 0;
672
809
 
673
- while (s[offset] == c[offset]) {
810
+ while ((src[offset] == c[offset]) && ('\0' != src[offset])) {
674
811
  offset++;
675
812
  }
676
813
 
677
814
  if ('\0' == c[offset]) {
678
- // we found a matching utf8 code point
679
- match = s;
680
- s += offset;
815
+ /* we found a matching utf8 code point */
816
+ match = (utf8_int8_t *)src;
817
+ src += offset;
818
+
819
+ if ('\0' == *src) {
820
+ break;
821
+ }
681
822
  } else {
682
- s += offset;
823
+ src += offset;
683
824
 
684
- // need to march s along to next utf8 codepoint start
685
- // (the next byte that doesn't match 0b10xxxxxx)
686
- if ('\0' != *s) {
825
+ /* need to march s along to next utf8 codepoint start
826
+ * (the next byte that doesn't match 0b10xxxxxx) */
827
+ if ('\0' != *src) {
687
828
  do {
688
- s++;
689
- } while (0x80 == (0xc0 & *s));
829
+ src++;
830
+ } while (0x80 == (0xc0 & *src));
690
831
  }
691
832
  }
692
833
  }
693
834
 
694
- // return the last match we found (or 0 if no match was found)
695
- return (void *)match;
835
+ /* return the last match we found (or 0 if no match was found) */
836
+ return match;
696
837
  }
697
838
 
698
- void *utf8pbrk(const void *str, const void *accept) {
699
- const char *s = (const char *)str;
700
-
701
- while ('\0' != *s) {
702
- const char *a = (const char *)accept;
839
+ utf8_constexpr14_impl utf8_int8_t *utf8pbrk(const utf8_int8_t *str,
840
+ const utf8_int8_t *accept) {
841
+ while ('\0' != *str) {
842
+ const utf8_int8_t *a = accept;
703
843
  size_t offset = 0;
704
844
 
705
845
  while ('\0' != *a) {
706
- // checking that if *a is the start of a utf8 codepoint
707
- // (it is not 0b10xxxxxx) and we have successfully matched
708
- // a previous character (0 < offset) - we found a match
846
+ /* checking that if *a is the start of a utf8 codepoint
847
+ * (it is not 0b10xxxxxx) and we have successfully matched
848
+ * a previous character (0 < offset) - we found a match */
709
849
  if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
710
- return (void *)s;
850
+ return (utf8_int8_t *)str;
711
851
  } else {
712
- if (*a == s[offset]) {
713
- // part of a utf8 codepoint matched, so move our checking
714
- // onwards to the next byte
852
+ if (*a == str[offset]) {
853
+ /* part of a utf8 codepoint matched, so move our checking
854
+ * onwards to the next byte */
715
855
  offset++;
716
856
  a++;
717
857
  } else {
718
- // r could be in the middle of an unmatching utf8 code point,
719
- // so we need to march it on to the next character beginning,
858
+ /* r could be in the middle of an unmatching utf8 code point,
859
+ * so we need to march it on to the next character beginning, */
720
860
 
721
861
  do {
722
862
  a++;
723
863
  } while (0x80 == (0xc0 & *a));
724
864
 
725
- // reset offset too as we found a mismatch
865
+ /* reset offset too as we found a mismatch */
726
866
  offset = 0;
727
867
  }
728
868
  }
729
869
  }
730
870
 
731
- // we found a match on the last utf8 codepoint
871
+ /* we found a match on the last utf8 codepoint */
732
872
  if (0 < offset) {
733
- return (void *)s;
873
+ return (utf8_int8_t *)str;
734
874
  }
735
875
 
736
- // the current utf8 codepoint in src did not match accept, but src
737
- // could have been partway through a utf8 codepoint, so we need to
738
- // march it onto the next utf8 codepoint starting byte
876
+ /* the current utf8 codepoint in src did not match accept, but src
877
+ * could have been partway through a utf8 codepoint, so we need to
878
+ * march it onto the next utf8 codepoint starting byte */
739
879
  do {
740
- s++;
741
- } while ((0x80 == (0xc0 & *s)));
880
+ str++;
881
+ } while ((0x80 == (0xc0 & *str)));
742
882
  }
743
883
 
744
884
  return utf8_null;
745
885
  }
746
886
 
747
- size_t utf8size(const void *str) {
748
- const char *s = (const char *)str;
887
+ utf8_constexpr14_impl size_t utf8size(const utf8_int8_t *str) {
888
+ return utf8size_lazy(str) + 1;
889
+ }
890
+
891
+ utf8_constexpr14_impl size_t utf8size_lazy(const utf8_int8_t *str) {
892
+ return utf8nsize_lazy(str, SIZE_MAX);
893
+ }
894
+
895
+ utf8_constexpr14_impl size_t utf8nsize_lazy(const utf8_int8_t *str, size_t n) {
749
896
  size_t size = 0;
750
- while ('\0' != s[size]) {
897
+ while (size < n && '\0' != str[size]) {
751
898
  size++;
752
899
  }
753
-
754
- // we are including the null terminating byte in the size calculation
755
- size++;
756
900
  return size;
757
901
  }
758
902
 
759
- size_t utf8spn(const void *src, const void *accept) {
760
- const char *s = (const char *)src;
903
+ utf8_constexpr14_impl size_t utf8spn(const utf8_int8_t *src,
904
+ const utf8_int8_t *accept) {
761
905
  size_t chars = 0;
762
906
 
763
- while ('\0' != *s) {
764
- const char *a = (const char *)accept;
907
+ while ('\0' != *src) {
908
+ const utf8_int8_t *a = accept;
765
909
  size_t offset = 0;
766
910
 
767
911
  while ('\0' != *a) {
768
- // checking that if *r is the start of a utf8 codepoint
769
- // (it is not 0b10xxxxxx) and we have successfully matched
770
- // a previous character (0 < offset) - we found a match
912
+ /* checking that if *r is the start of a utf8 codepoint
913
+ * (it is not 0b10xxxxxx) and we have successfully matched
914
+ * a previous character (0 < offset) - we found a match */
771
915
  if ((0x80 != (0xc0 & *a)) && (0 < offset)) {
772
- // found a match, so increment the number of utf8 codepoints
773
- // that have matched and stop checking whether any other utf8
774
- // codepoints in a match
916
+ /* found a match, so increment the number of utf8 codepoints
917
+ * that have matched and stop checking whether any other utf8
918
+ * codepoints in a match */
775
919
  chars++;
776
- s += offset;
920
+ src += offset;
921
+ offset = 0;
777
922
  break;
778
923
  } else {
779
- if (*a == s[offset]) {
924
+ if (*a == src[offset]) {
780
925
  offset++;
781
926
  a++;
782
927
  } else {
783
- // a could be in the middle of an unmatching utf8 codepoint,
784
- // so we need to march it on to the next character beginning,
928
+ /* a could be in the middle of an unmatching utf8 codepoint,
929
+ * so we need to march it on to the next character beginning, */
785
930
  do {
786
931
  a++;
787
932
  } while (0x80 == (0xc0 & *a));
788
933
 
789
- // reset offset too as we found a mismatch
934
+ /* reset offset too as we found a mismatch */
790
935
  offset = 0;
791
936
  }
792
937
  }
793
938
  }
794
939
 
795
- // if a got to its terminating null byte, then we didn't find a match.
796
- // Return the current number of matched utf8 codepoints
940
+ /* found a match at the end of *a, so didn't get a chance to test it */
941
+ if (0 < offset) {
942
+ chars++;
943
+ src += offset;
944
+ continue;
945
+ }
946
+
947
+ /* if a got to its terminating null byte, then we didn't find a match.
948
+ * Return the current number of matched utf8 codepoints */
797
949
  if ('\0' == *a) {
798
950
  return chars;
799
951
  }
@@ -802,302 +954,405 @@ size_t utf8spn(const void *src, const void *accept) {
802
954
  return chars;
803
955
  }
804
956
 
805
- void *utf8str(const void *haystack, const void *needle) {
806
- const char *h = (const char *)haystack;
957
+ utf8_constexpr14_impl utf8_int8_t *utf8str(const utf8_int8_t *haystack,
958
+ const utf8_int8_t *needle) {
959
+ utf8_int32_t throwaway_codepoint = 0;
807
960
 
808
- // if needle has no utf8 codepoints before the null terminating
809
- // byte then return haystack
810
- if ('\0' == *((const char *)needle)) {
811
- return (void *)haystack;
961
+ /* if needle has no utf8 codepoints before the null terminating
962
+ * byte then return haystack */
963
+ if ('\0' == *needle) {
964
+ return (utf8_int8_t *)haystack;
812
965
  }
813
966
 
814
- while ('\0' != *h) {
815
- const char *maybeMatch = h;
816
- const char *n = (const char *)needle;
967
+ while ('\0' != *haystack) {
968
+ const utf8_int8_t *maybeMatch = haystack;
969
+ const utf8_int8_t *n = needle;
817
970
 
818
- while (*h == *n && (*h != '\0' && *n != '\0')) {
971
+ while (*haystack == *n && (*haystack != '\0' && *n != '\0')) {
819
972
  n++;
820
- h++;
973
+ haystack++;
821
974
  }
822
975
 
823
976
  if ('\0' == *n) {
824
- // we found the whole utf8 string for needle in haystack at
825
- // maybeMatch, so return it
826
- return (void *)maybeMatch;
977
+ /* we found the whole utf8 string for needle in haystack at
978
+ * maybeMatch, so return it */
979
+ return (utf8_int8_t *)maybeMatch;
827
980
  } else {
828
- // h could be in the middle of an unmatching utf8 codepoint,
829
- // so we need to march it on to the next character beginning,
830
- if ('\0' != *h) {
831
- do {
832
- h++;
833
- } while (0x80 == (0xc0 & *h));
834
- }
981
+ /* h could be in the middle of an unmatching utf8 codepoint,
982
+ * so we need to march it on to the next character beginning
983
+ * starting from the current character */
984
+ haystack = utf8codepoint(maybeMatch, &throwaway_codepoint);
835
985
  }
836
986
  }
837
987
 
838
- // no match
988
+ /* no match */
839
989
  return utf8_null;
840
990
  }
841
991
 
842
- void *utf8casestr(const void *haystack, const void *needle) {
843
- const void *h = haystack;
844
-
845
- // if needle has no utf8 codepoints before the null terminating
846
- // byte then return haystack
847
- if ('\0' == *((const char *)needle)) {
848
- return (void *)haystack;
992
+ utf8_constexpr14_impl utf8_int8_t *utf8casestr(const utf8_int8_t *haystack,
993
+ const utf8_int8_t *needle) {
994
+ /* if needle has no utf8 codepoints before the null terminating
995
+ * byte then return haystack */
996
+ if ('\0' == *needle) {
997
+ return (utf8_int8_t *)haystack;
849
998
  }
850
999
 
851
1000
  for (;;) {
852
- const void *maybeMatch = h;
853
- const void *n = needle;
854
- utf8_int32_t h_cp, n_cp;
1001
+ const utf8_int8_t *maybeMatch = haystack;
1002
+ const utf8_int8_t *n = needle;
1003
+ utf8_int32_t h_cp = 0, n_cp = 0;
855
1004
 
856
- h = utf8codepoint(h, &h_cp);
1005
+ /* Get the next code point and track it */
1006
+ const utf8_int8_t *nextH = haystack = utf8codepoint(haystack, &h_cp);
857
1007
  n = utf8codepoint(n, &n_cp);
858
1008
 
859
1009
  while ((0 != h_cp) && (0 != n_cp)) {
860
1010
  h_cp = utf8lwrcodepoint(h_cp);
861
1011
  n_cp = utf8lwrcodepoint(n_cp);
862
1012
 
863
- // if we find a mismatch, bail out!
1013
+ /* if we find a mismatch, bail out! */
864
1014
  if (h_cp != n_cp) {
865
1015
  break;
866
1016
  }
867
1017
 
868
- h = utf8codepoint(h, &h_cp);
1018
+ haystack = utf8codepoint(haystack, &h_cp);
869
1019
  n = utf8codepoint(n, &n_cp);
870
1020
  }
871
1021
 
872
1022
  if (0 == n_cp) {
873
- // we found the whole utf8 string for needle in haystack at
874
- // maybeMatch, so return it
875
- return (void *)maybeMatch;
1023
+ /* we found the whole utf8 string for needle in haystack at
1024
+ * maybeMatch, so return it */
1025
+ return (utf8_int8_t *)maybeMatch;
876
1026
  }
877
1027
 
878
1028
  if (0 == h_cp) {
879
- // no match
1029
+ /* no match */
880
1030
  return utf8_null;
881
1031
  }
1032
+
1033
+ /* Roll back to the next code point in the haystack to test */
1034
+ haystack = nextH;
882
1035
  }
883
1036
  }
884
1037
 
885
- void *utf8valid(const void *str) {
886
- const char *s = (const char *)str;
1038
+ utf8_constexpr14_impl utf8_int8_t *utf8valid(const utf8_int8_t *str) {
1039
+ return utf8nvalid(str, SIZE_MAX);
1040
+ }
1041
+
1042
+ utf8_constexpr14_impl utf8_int8_t *utf8nvalid(const utf8_int8_t *str,
1043
+ size_t n) {
1044
+ const utf8_int8_t *t = str;
1045
+ size_t consumed = 0;
1046
+
1047
+ while ((void)(consumed = (size_t)(str - t)), consumed < n && '\0' != *str) {
1048
+ const size_t remaining = n - consumed;
1049
+
1050
+ if (0xf0 == (0xf8 & *str)) {
1051
+ /* ensure that there's 4 bytes or more remaining */
1052
+ if (remaining < 4) {
1053
+ return (utf8_int8_t *)str;
1054
+ }
1055
+
1056
+ /* ensure each of the 3 following bytes in this 4-byte
1057
+ * utf8 codepoint began with 0b10xxxxxx */
1058
+ if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2])) ||
1059
+ (0x80 != (0xc0 & str[3]))) {
1060
+ return (utf8_int8_t *)str;
1061
+ }
1062
+
1063
+ /* ensure that our utf8 codepoint ended after 4 bytes */
1064
+ if ((remaining != 4) && (0x80 == (0xc0 & str[4]))) {
1065
+ return (utf8_int8_t *)str;
1066
+ }
887
1067
 
888
- while ('\0' != *s) {
889
- if (0xf0 == (0xf8 & *s)) {
890
- // ensure each of the 3 following bytes in this 4-byte
891
- // utf8 codepoint began with 0b10xxxxxx
892
- if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2])) ||
893
- (0x80 != (0xc0 & s[3]))) {
894
- return (void *)s;
1068
+ /* ensure that the top 5 bits of this 4-byte utf8
1069
+ * codepoint were not 0, as then we could have used
1070
+ * one of the smaller encodings */
1071
+ if ((0 == (0x07 & str[0])) && (0 == (0x30 & str[1]))) {
1072
+ return (utf8_int8_t *)str;
895
1073
  }
896
1074
 
897
- // ensure that our utf8 codepoint ended after 4 bytes
898
- if (0x80 == (0xc0 & s[4])) {
899
- return (void *)s;
1075
+ /* 4-byte utf8 code point (began with 0b11110xxx) */
1076
+ str += 4;
1077
+ } else if (0xe0 == (0xf0 & *str)) {
1078
+ /* ensure that there's 3 bytes or more remaining */
1079
+ if (remaining < 3) {
1080
+ return (utf8_int8_t *)str;
900
1081
  }
901
1082
 
902
- // ensure that the top 5 bits of this 4-byte utf8
903
- // codepoint were not 0, as then we could have used
904
- // one of the smaller encodings
905
- if ((0 == (0x07 & s[0])) && (0 == (0x30 & s[1]))) {
906
- return (void *)s;
1083
+ /* ensure each of the 2 following bytes in this 3-byte
1084
+ * utf8 codepoint began with 0b10xxxxxx */
1085
+ if ((0x80 != (0xc0 & str[1])) || (0x80 != (0xc0 & str[2]))) {
1086
+ return (utf8_int8_t *)str;
907
1087
  }
908
1088
 
909
- // 4-byte utf8 code point (began with 0b11110xxx)
910
- s += 4;
911
- } else if (0xe0 == (0xf0 & *s)) {
912
- // ensure each of the 2 following bytes in this 3-byte
913
- // utf8 codepoint began with 0b10xxxxxx
914
- if ((0x80 != (0xc0 & s[1])) || (0x80 != (0xc0 & s[2]))) {
915
- return (void *)s;
1089
+ /* ensure that our utf8 codepoint ended after 3 bytes */
1090
+ if ((remaining != 3) && (0x80 == (0xc0 & str[3]))) {
1091
+ return (utf8_int8_t *)str;
916
1092
  }
917
1093
 
918
- // ensure that our utf8 codepoint ended after 3 bytes
919
- if (0x80 == (0xc0 & s[3])) {
920
- return (void *)s;
1094
+ /* ensure that the top 5 bits of this 3-byte utf8
1095
+ * codepoint were not 0, as then we could have used
1096
+ * one of the smaller encodings */
1097
+ if ((0 == (0x0f & str[0])) && (0 == (0x20 & str[1]))) {
1098
+ return (utf8_int8_t *)str;
921
1099
  }
922
1100
 
923
- // ensure that the top 5 bits of this 3-byte utf8
924
- // codepoint were not 0, as then we could have used
925
- // one of the smaller encodings
926
- if ((0 == (0x0f & s[0])) && (0 == (0x20 & s[1]))) {
927
- return (void *)s;
1101
+ /* 3-byte utf8 code point (began with 0b1110xxxx) */
1102
+ str += 3;
1103
+ } else if (0xc0 == (0xe0 & *str)) {
1104
+ /* ensure that there's 2 bytes or more remaining */
1105
+ if (remaining < 2) {
1106
+ return (utf8_int8_t *)str;
928
1107
  }
929
1108
 
930
- // 3-byte utf8 code point (began with 0b1110xxxx)
931
- s += 3;
932
- } else if (0xc0 == (0xe0 & *s)) {
933
- // ensure the 1 following byte in this 2-byte
934
- // utf8 codepoint began with 0b10xxxxxx
935
- if (0x80 != (0xc0 & s[1])) {
936
- return (void *)s;
1109
+ /* ensure the 1 following byte in this 2-byte
1110
+ * utf8 codepoint began with 0b10xxxxxx */
1111
+ if (0x80 != (0xc0 & str[1])) {
1112
+ return (utf8_int8_t *)str;
937
1113
  }
938
1114
 
939
- // ensure that our utf8 codepoint ended after 2 bytes
940
- if (0x80 == (0xc0 & s[2])) {
941
- return (void *)s;
1115
+ /* ensure that our utf8 codepoint ended after 2 bytes */
1116
+ if ((remaining != 2) && (0x80 == (0xc0 & str[2]))) {
1117
+ return (utf8_int8_t *)str;
942
1118
  }
943
1119
 
944
- // ensure that the top 4 bits of this 2-byte utf8
945
- // codepoint were not 0, as then we could have used
946
- // one of the smaller encodings
947
- if (0 == (0x1e & s[0])) {
948
- return (void *)s;
1120
+ /* ensure that the top 4 bits of this 2-byte utf8
1121
+ * codepoint were not 0, as then we could have used
1122
+ * one of the smaller encodings */
1123
+ if (0 == (0x1e & str[0])) {
1124
+ return (utf8_int8_t *)str;
949
1125
  }
950
1126
 
951
- // 2-byte utf8 code point (began with 0b110xxxxx)
952
- s += 2;
953
- } else if (0x00 == (0x80 & *s)) {
954
- // 1-byte ascii (began with 0b0xxxxxxx)
955
- s += 1;
1127
+ /* 2-byte utf8 code point (began with 0b110xxxxx) */
1128
+ str += 2;
1129
+ } else if (0x00 == (0x80 & *str)) {
1130
+ /* 1-byte ascii (began with 0b0xxxxxxx) */
1131
+ str += 1;
956
1132
  } else {
957
- // we have an invalid 0b1xxxxxxx utf8 code point entry
958
- return (void *)s;
1133
+ /* we have an invalid 0b1xxxxxxx utf8 code point entry */
1134
+ return (utf8_int8_t *)str;
959
1135
  }
960
1136
  }
961
1137
 
962
1138
  return utf8_null;
963
1139
  }
964
1140
 
965
- void *utf8codepoint(const void *utf8_restrict str,
966
- utf8_int32_t *utf8_restrict out_codepoint) {
967
- const char *s = (const char *)str;
1141
+ int utf8makevalid(utf8_int8_t *str, const utf8_int32_t replacement) {
1142
+ utf8_int8_t *read = str;
1143
+ utf8_int8_t *write = read;
1144
+ const utf8_int8_t r = (utf8_int8_t)replacement;
1145
+ utf8_int32_t codepoint = 0;
968
1146
 
969
- if (0xf0 == (0xf8 & s[0])) {
970
- // 4 byte utf8 codepoint
971
- *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
972
- ((0x3f & s[2]) << 6) | (0x3f & s[3]);
973
- s += 4;
974
- } else if (0xe0 == (0xf0 & s[0])) {
975
- // 3 byte utf8 codepoint
1147
+ if (replacement > 0x7f) {
1148
+ return -1;
1149
+ }
1150
+
1151
+ while ('\0' != *read) {
1152
+ if (0xf0 == (0xf8 & *read)) {
1153
+ /* ensure each of the 3 following bytes in this 4-byte
1154
+ * utf8 codepoint began with 0b10xxxxxx */
1155
+ if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2])) ||
1156
+ (0x80 != (0xc0 & read[3]))) {
1157
+ *write++ = r;
1158
+ read++;
1159
+ continue;
1160
+ }
1161
+
1162
+ /* 4-byte utf8 code point (began with 0b11110xxx) */
1163
+ read = utf8codepoint(read, &codepoint);
1164
+ write = utf8catcodepoint(write, codepoint, 4);
1165
+ } else if (0xe0 == (0xf0 & *read)) {
1166
+ /* ensure each of the 2 following bytes in this 3-byte
1167
+ * utf8 codepoint began with 0b10xxxxxx */
1168
+ if ((0x80 != (0xc0 & read[1])) || (0x80 != (0xc0 & read[2]))) {
1169
+ *write++ = r;
1170
+ read++;
1171
+ continue;
1172
+ }
1173
+
1174
+ /* 3-byte utf8 code point (began with 0b1110xxxx) */
1175
+ read = utf8codepoint(read, &codepoint);
1176
+ write = utf8catcodepoint(write, codepoint, 3);
1177
+ } else if (0xc0 == (0xe0 & *read)) {
1178
+ /* ensure the 1 following byte in this 2-byte
1179
+ * utf8 codepoint began with 0b10xxxxxx */
1180
+ if (0x80 != (0xc0 & read[1])) {
1181
+ *write++ = r;
1182
+ read++;
1183
+ continue;
1184
+ }
1185
+
1186
+ /* 2-byte utf8 code point (began with 0b110xxxxx) */
1187
+ read = utf8codepoint(read, &codepoint);
1188
+ write = utf8catcodepoint(write, codepoint, 2);
1189
+ } else if (0x00 == (0x80 & *read)) {
1190
+ /* 1-byte ascii (began with 0b0xxxxxxx) */
1191
+ read = utf8codepoint(read, &codepoint);
1192
+ write = utf8catcodepoint(write, codepoint, 1);
1193
+ } else {
1194
+ /* if we got here then we've got a dangling continuation (0b10xxxxxx) */
1195
+ *write++ = r;
1196
+ read++;
1197
+ continue;
1198
+ }
1199
+ }
1200
+
1201
+ *write = '\0';
1202
+
1203
+ return 0;
1204
+ }
1205
+
1206
+ utf8_constexpr14_impl utf8_int8_t *
1207
+ utf8codepoint(const utf8_int8_t *utf8_restrict str,
1208
+ utf8_int32_t *utf8_restrict out_codepoint) {
1209
+ if (0xf0 == (0xf8 & str[0])) {
1210
+ /* 4 byte utf8 codepoint */
1211
+ *out_codepoint = ((0x07 & str[0]) << 18) | ((0x3f & str[1]) << 12) |
1212
+ ((0x3f & str[2]) << 6) | (0x3f & str[3]);
1213
+ str += 4;
1214
+ } else if (0xe0 == (0xf0 & str[0])) {
1215
+ /* 3 byte utf8 codepoint */
976
1216
  *out_codepoint =
977
- ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
978
- s += 3;
979
- } else if (0xc0 == (0xe0 & s[0])) {
980
- // 2 byte utf8 codepoint
981
- *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
982
- s += 2;
1217
+ ((0x0f & str[0]) << 12) | ((0x3f & str[1]) << 6) | (0x3f & str[2]);
1218
+ str += 3;
1219
+ } else if (0xc0 == (0xe0 & str[0])) {
1220
+ /* 2 byte utf8 codepoint */
1221
+ *out_codepoint = ((0x1f & str[0]) << 6) | (0x3f & str[1]);
1222
+ str += 2;
983
1223
  } else {
984
- // 1 byte utf8 codepoint otherwise
985
- *out_codepoint = s[0];
986
- s += 1;
1224
+ /* 1 byte utf8 codepoint otherwise */
1225
+ *out_codepoint = str[0];
1226
+ str += 1;
987
1227
  }
988
1228
 
989
- return (void *)s;
1229
+ return (utf8_int8_t *)str;
990
1230
  }
991
1231
 
992
- size_t utf8codepointsize(utf8_int32_t chr) {
1232
+ utf8_constexpr14_impl size_t utf8codepointcalcsize(const utf8_int8_t *str) {
1233
+ if (0xf0 == (0xf8 & str[0])) {
1234
+ /* 4 byte utf8 codepoint */
1235
+ return 4;
1236
+ } else if (0xe0 == (0xf0 & str[0])) {
1237
+ /* 3 byte utf8 codepoint */
1238
+ return 3;
1239
+ } else if (0xc0 == (0xe0 & str[0])) {
1240
+ /* 2 byte utf8 codepoint */
1241
+ return 2;
1242
+ }
1243
+
1244
+ /* 1 byte utf8 codepoint otherwise */
1245
+ return 1;
1246
+ }
1247
+
1248
+ utf8_constexpr14_impl size_t utf8codepointsize(utf8_int32_t chr) {
993
1249
  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
994
1250
  return 1;
995
1251
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
996
1252
  return 2;
997
1253
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
998
1254
  return 3;
999
- } else { // if (0 == ((int)0xffe00000 & chr)) {
1255
+ } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1000
1256
  return 4;
1001
1257
  }
1002
1258
  }
1003
1259
 
1004
- void *utf8catcodepoint(void *utf8_restrict str, utf8_int32_t chr, size_t n) {
1005
- char *s = (char *)str;
1006
-
1260
+ utf8_int8_t *utf8catcodepoint(utf8_int8_t *str, utf8_int32_t chr, size_t n) {
1007
1261
  if (0 == ((utf8_int32_t)0xffffff80 & chr)) {
1008
- // 1-byte/7-bit ascii
1009
- // (0b0xxxxxxx)
1262
+ /* 1-byte/7-bit ascii
1263
+ * (0b0xxxxxxx) */
1010
1264
  if (n < 1) {
1011
1265
  return utf8_null;
1012
1266
  }
1013
- s[0] = (char)chr;
1014
- s += 1;
1267
+ str[0] = (utf8_int8_t)chr;
1268
+ str += 1;
1015
1269
  } else if (0 == ((utf8_int32_t)0xfffff800 & chr)) {
1016
- // 2-byte/11-bit utf8 code point
1017
- // (0b110xxxxx 0b10xxxxxx)
1270
+ /* 2-byte/11-bit utf8 code point
1271
+ * (0b110xxxxx 0b10xxxxxx) */
1018
1272
  if (n < 2) {
1019
1273
  return utf8_null;
1020
1274
  }
1021
- s[0] = 0xc0 | (char)(chr >> 6);
1022
- s[1] = 0x80 | (char)(chr & 0x3f);
1023
- s += 2;
1275
+ str[0] = (utf8_int8_t)(0xc0 | (utf8_int8_t)((chr >> 6) & 0x1f));
1276
+ str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1277
+ str += 2;
1024
1278
  } else if (0 == ((utf8_int32_t)0xffff0000 & chr)) {
1025
- // 3-byte/16-bit utf8 code point
1026
- // (0b1110xxxx 0b10xxxxxx 0b10xxxxxx)
1279
+ /* 3-byte/16-bit utf8 code point
1280
+ * (0b1110xxxx 0b10xxxxxx 0b10xxxxxx) */
1027
1281
  if (n < 3) {
1028
1282
  return utf8_null;
1029
1283
  }
1030
- s[0] = 0xe0 | (char)(chr >> 12);
1031
- s[1] = 0x80 | (char)((chr >> 6) & 0x3f);
1032
- s[2] = 0x80 | (char)(chr & 0x3f);
1033
- s += 3;
1034
- } else { // if (0 == ((int)0xffe00000 & chr)) {
1035
- // 4-byte/21-bit utf8 code point
1036
- // (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx)
1284
+ str[0] = (utf8_int8_t)(0xe0 | (utf8_int8_t)((chr >> 12) & 0x0f));
1285
+ str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1286
+ str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1287
+ str += 3;
1288
+ } else { /* if (0 == ((int)0xffe00000 & chr)) { */
1289
+ /* 4-byte/21-bit utf8 code point
1290
+ * (0b11110xxx 0b10xxxxxx 0b10xxxxxx 0b10xxxxxx) */
1037
1291
  if (n < 4) {
1038
1292
  return utf8_null;
1039
1293
  }
1040
- s[0] = 0xf0 | (char)(chr >> 18);
1041
- s[1] = 0x80 | (char)((chr >> 12) & 0x3f);
1042
- s[2] = 0x80 | (char)((chr >> 6) & 0x3f);
1043
- s[3] = 0x80 | (char)(chr & 0x3f);
1044
- s += 4;
1294
+ str[0] = (utf8_int8_t)(0xf0 | (utf8_int8_t)((chr >> 18) & 0x07));
1295
+ str[1] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 12) & 0x3f));
1296
+ str[2] = (utf8_int8_t)(0x80 | (utf8_int8_t)((chr >> 6) & 0x3f));
1297
+ str[3] = (utf8_int8_t)(0x80 | (utf8_int8_t)(chr & 0x3f));
1298
+ str += 4;
1045
1299
  }
1046
1300
 
1047
- return s;
1301
+ return str;
1048
1302
  }
1049
1303
 
1050
- int utf8islower(utf8_int32_t chr) { return chr != utf8uprcodepoint(chr); }
1051
-
1052
- int utf8isupper(utf8_int32_t chr) { return chr != utf8lwrcodepoint(chr); }
1304
+ utf8_constexpr14_impl int utf8islower(utf8_int32_t chr) {
1305
+ return chr != utf8uprcodepoint(chr);
1306
+ }
1053
1307
 
1054
- void utf8lwr(void *utf8_restrict str) {
1055
- void *p, *pn;
1056
- utf8_int32_t cp;
1308
+ utf8_constexpr14_impl int utf8isupper(utf8_int32_t chr) {
1309
+ return chr != utf8lwrcodepoint(chr);
1310
+ }
1057
1311
 
1058
- p = (char *)str;
1059
- pn = utf8codepoint(p, &cp);
1312
+ void utf8lwr(utf8_int8_t *utf8_restrict str) {
1313
+ utf8_int32_t cp = 0;
1314
+ utf8_int8_t *pn = utf8codepoint(str, &cp);
1060
1315
 
1061
1316
  while (cp != 0) {
1062
1317
  const utf8_int32_t lwr_cp = utf8lwrcodepoint(cp);
1063
1318
  const size_t size = utf8codepointsize(lwr_cp);
1064
1319
 
1065
1320
  if (lwr_cp != cp) {
1066
- utf8catcodepoint(p, lwr_cp, size);
1321
+ utf8catcodepoint(str, lwr_cp, size);
1067
1322
  }
1068
1323
 
1069
- p = pn;
1070
- pn = utf8codepoint(p, &cp);
1324
+ str = pn;
1325
+ pn = utf8codepoint(str, &cp);
1071
1326
  }
1072
1327
  }
1073
1328
 
1074
- void utf8upr(void *utf8_restrict str) {
1075
- void *p, *pn;
1076
- utf8_int32_t cp;
1077
-
1078
- p = (char *)str;
1079
- pn = utf8codepoint(p, &cp);
1329
+ void utf8upr(utf8_int8_t *utf8_restrict str) {
1330
+ utf8_int32_t cp = 0;
1331
+ utf8_int8_t *pn = utf8codepoint(str, &cp);
1080
1332
 
1081
1333
  while (cp != 0) {
1082
1334
  const utf8_int32_t lwr_cp = utf8uprcodepoint(cp);
1083
1335
  const size_t size = utf8codepointsize(lwr_cp);
1084
1336
 
1085
1337
  if (lwr_cp != cp) {
1086
- utf8catcodepoint(p, lwr_cp, size);
1338
+ utf8catcodepoint(str, lwr_cp, size);
1087
1339
  }
1088
1340
 
1089
- p = pn;
1090
- pn = utf8codepoint(p, &cp);
1341
+ str = pn;
1342
+ pn = utf8codepoint(str, &cp);
1091
1343
  }
1092
1344
  }
1093
1345
 
1094
- utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1346
+ utf8_constexpr14_impl utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1095
1347
  if (((0x0041 <= cp) && (0x005a >= cp)) ||
1096
1348
  ((0x00c0 <= cp) && (0x00d6 >= cp)) ||
1097
1349
  ((0x00d8 <= cp) && (0x00de >= cp)) ||
1098
1350
  ((0x0391 <= cp) && (0x03a1 >= cp)) ||
1099
- ((0x03a3 <= cp) && (0x03ab >= cp))) {
1351
+ ((0x03a3 <= cp) && (0x03ab >= cp)) ||
1352
+ ((0x0410 <= cp) && (0x042f >= cp))) {
1100
1353
  cp += 32;
1354
+ } else if ((0x0400 <= cp) && (0x040f >= cp)) {
1355
+ cp += 80;
1101
1356
  } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1102
1357
  ((0x0132 <= cp) && (0x0137 >= cp)) ||
1103
1358
  ((0x014a <= cp) && (0x0177 >= cp)) ||
@@ -1107,7 +1362,9 @@ utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1107
1362
  ((0x01f8 <= cp) && (0x021f >= cp)) ||
1108
1363
  ((0x0222 <= cp) && (0x0233 >= cp)) ||
1109
1364
  ((0x0246 <= cp) && (0x024f >= cp)) ||
1110
- ((0x03d8 <= cp) && (0x03ef >= cp))) {
1365
+ ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1366
+ ((0x0460 <= cp) && (0x0481 >= cp)) ||
1367
+ ((0x048a <= cp) && (0x04ff >= cp))) {
1111
1368
  cp |= 0x1;
1112
1369
  } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1113
1370
  ((0x0179 <= cp) && (0x017e >= cp)) ||
@@ -1118,62 +1375,147 @@ utf8_int32_t utf8lwrcodepoint(utf8_int32_t cp) {
1118
1375
  cp &= ~0x1;
1119
1376
  } else {
1120
1377
  switch (cp) {
1121
- default: break;
1122
- case 0x0178: cp = 0x00ff; break;
1123
- case 0x0243: cp = 0x0180; break;
1124
- case 0x018e: cp = 0x01dd; break;
1125
- case 0x023d: cp = 0x019a; break;
1126
- case 0x0220: cp = 0x019e; break;
1127
- case 0x01b7: cp = 0x0292; break;
1128
- case 0x01c4: cp = 0x01c6; break;
1129
- case 0x01c7: cp = 0x01c9; break;
1130
- case 0x01ca: cp = 0x01cc; break;
1131
- case 0x01f1: cp = 0x01f3; break;
1132
- case 0x01f7: cp = 0x01bf; break;
1133
- case 0x0187: cp = 0x0188; break;
1134
- case 0x018b: cp = 0x018c; break;
1135
- case 0x0191: cp = 0x0192; break;
1136
- case 0x0198: cp = 0x0199; break;
1137
- case 0x01a7: cp = 0x01a8; break;
1138
- case 0x01ac: cp = 0x01ad; break;
1139
- case 0x01af: cp = 0x01b0; break;
1140
- case 0x01b8: cp = 0x01b9; break;
1141
- case 0x01bc: cp = 0x01bd; break;
1142
- case 0x01f4: cp = 0x01f5; break;
1143
- case 0x023b: cp = 0x023c; break;
1144
- case 0x0241: cp = 0x0242; break;
1145
- case 0x03fd: cp = 0x037b; break;
1146
- case 0x03fe: cp = 0x037c; break;
1147
- case 0x03ff: cp = 0x037d; break;
1148
- case 0x037f: cp = 0x03f3; break;
1149
- case 0x0386: cp = 0x03ac; break;
1150
- case 0x0388: cp = 0x03ad; break;
1151
- case 0x0389: cp = 0x03ae; break;
1152
- case 0x038a: cp = 0x03af; break;
1153
- case 0x038c: cp = 0x03cc; break;
1154
- case 0x038e: cp = 0x03cd; break;
1155
- case 0x038f: cp = 0x03ce; break;
1156
- case 0x0370: cp = 0x0371; break;
1157
- case 0x0372: cp = 0x0373; break;
1158
- case 0x0376: cp = 0x0377; break;
1159
- case 0x03f4: cp = 0x03d1; break;
1160
- case 0x03cf: cp = 0x03d7; break;
1161
- case 0x03f9: cp = 0x03f2; break;
1162
- case 0x03f7: cp = 0x03f8; break;
1163
- case 0x03fa: cp = 0x03fb; break;
1164
- };
1378
+ default:
1379
+ break;
1380
+ case 0x0178:
1381
+ cp = 0x00ff;
1382
+ break;
1383
+ case 0x0243:
1384
+ cp = 0x0180;
1385
+ break;
1386
+ case 0x018e:
1387
+ cp = 0x01dd;
1388
+ break;
1389
+ case 0x023d:
1390
+ cp = 0x019a;
1391
+ break;
1392
+ case 0x0220:
1393
+ cp = 0x019e;
1394
+ break;
1395
+ case 0x01b7:
1396
+ cp = 0x0292;
1397
+ break;
1398
+ case 0x01c4:
1399
+ cp = 0x01c6;
1400
+ break;
1401
+ case 0x01c7:
1402
+ cp = 0x01c9;
1403
+ break;
1404
+ case 0x01ca:
1405
+ cp = 0x01cc;
1406
+ break;
1407
+ case 0x01f1:
1408
+ cp = 0x01f3;
1409
+ break;
1410
+ case 0x01f7:
1411
+ cp = 0x01bf;
1412
+ break;
1413
+ case 0x0187:
1414
+ cp = 0x0188;
1415
+ break;
1416
+ case 0x018b:
1417
+ cp = 0x018c;
1418
+ break;
1419
+ case 0x0191:
1420
+ cp = 0x0192;
1421
+ break;
1422
+ case 0x0198:
1423
+ cp = 0x0199;
1424
+ break;
1425
+ case 0x01a7:
1426
+ cp = 0x01a8;
1427
+ break;
1428
+ case 0x01ac:
1429
+ cp = 0x01ad;
1430
+ break;
1431
+ case 0x01b8:
1432
+ cp = 0x01b9;
1433
+ break;
1434
+ case 0x01bc:
1435
+ cp = 0x01bd;
1436
+ break;
1437
+ case 0x01f4:
1438
+ cp = 0x01f5;
1439
+ break;
1440
+ case 0x023b:
1441
+ cp = 0x023c;
1442
+ break;
1443
+ case 0x0241:
1444
+ cp = 0x0242;
1445
+ break;
1446
+ case 0x03fd:
1447
+ cp = 0x037b;
1448
+ break;
1449
+ case 0x03fe:
1450
+ cp = 0x037c;
1451
+ break;
1452
+ case 0x03ff:
1453
+ cp = 0x037d;
1454
+ break;
1455
+ case 0x037f:
1456
+ cp = 0x03f3;
1457
+ break;
1458
+ case 0x0386:
1459
+ cp = 0x03ac;
1460
+ break;
1461
+ case 0x0388:
1462
+ cp = 0x03ad;
1463
+ break;
1464
+ case 0x0389:
1465
+ cp = 0x03ae;
1466
+ break;
1467
+ case 0x038a:
1468
+ cp = 0x03af;
1469
+ break;
1470
+ case 0x038c:
1471
+ cp = 0x03cc;
1472
+ break;
1473
+ case 0x038e:
1474
+ cp = 0x03cd;
1475
+ break;
1476
+ case 0x038f:
1477
+ cp = 0x03ce;
1478
+ break;
1479
+ case 0x0370:
1480
+ cp = 0x0371;
1481
+ break;
1482
+ case 0x0372:
1483
+ cp = 0x0373;
1484
+ break;
1485
+ case 0x0376:
1486
+ cp = 0x0377;
1487
+ break;
1488
+ case 0x03f4:
1489
+ cp = 0x03b8;
1490
+ break;
1491
+ case 0x03cf:
1492
+ cp = 0x03d7;
1493
+ break;
1494
+ case 0x03f9:
1495
+ cp = 0x03f2;
1496
+ break;
1497
+ case 0x03f7:
1498
+ cp = 0x03f8;
1499
+ break;
1500
+ case 0x03fa:
1501
+ cp = 0x03fb;
1502
+ break;
1503
+ }
1165
1504
  }
1166
1505
 
1167
1506
  return cp;
1168
1507
  }
1169
1508
 
1170
- utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1509
+ utf8_constexpr14_impl utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1171
1510
  if (((0x0061 <= cp) && (0x007a >= cp)) ||
1172
1511
  ((0x00e0 <= cp) && (0x00f6 >= cp)) ||
1173
1512
  ((0x00f8 <= cp) && (0x00fe >= cp)) ||
1174
1513
  ((0x03b1 <= cp) && (0x03c1 >= cp)) ||
1175
- ((0x03c3 <= cp) && (0x03cb >= cp))) {
1514
+ ((0x03c3 <= cp) && (0x03cb >= cp)) ||
1515
+ ((0x0430 <= cp) && (0x044f >= cp))) {
1176
1516
  cp -= 32;
1517
+ } else if ((0x0450 <= cp) && (0x045f >= cp)) {
1518
+ cp -= 80;
1177
1519
  } else if (((0x0100 <= cp) && (0x012f >= cp)) ||
1178
1520
  ((0x0132 <= cp) && (0x0137 >= cp)) ||
1179
1521
  ((0x014a <= cp) && (0x0177 >= cp)) ||
@@ -1183,7 +1525,9 @@ utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1183
1525
  ((0x01f8 <= cp) && (0x021f >= cp)) ||
1184
1526
  ((0x0222 <= cp) && (0x0233 >= cp)) ||
1185
1527
  ((0x0246 <= cp) && (0x024f >= cp)) ||
1186
- ((0x03d8 <= cp) && (0x03ef >= cp))) {
1528
+ ((0x03d8 <= cp) && (0x03ef >= cp)) ||
1529
+ ((0x0460 <= cp) && (0x0481 >= cp)) ||
1530
+ ((0x048a <= cp) && (0x04ff >= cp))) {
1187
1531
  cp &= ~0x1;
1188
1532
  } else if (((0x0139 <= cp) && (0x0148 >= cp)) ||
1189
1533
  ((0x0179 <= cp) && (0x017e >= cp)) ||
@@ -1194,64 +1538,175 @@ utf8_int32_t utf8uprcodepoint(utf8_int32_t cp) {
1194
1538
  cp |= 0x1;
1195
1539
  } else {
1196
1540
  switch (cp) {
1197
- default: break;
1198
- case 0x00ff: cp = 0x0178; break;
1199
- case 0x0180: cp = 0x0243; break;
1200
- case 0x01dd: cp = 0x018e; break;
1201
- case 0x019a: cp = 0x023d; break;
1202
- case 0x019e: cp = 0x0220; break;
1203
- case 0x0292: cp = 0x01b7; break;
1204
- case 0x01c6: cp = 0x01c4; break;
1205
- case 0x01c9: cp = 0x01c7; break;
1206
- case 0x01cc: cp = 0x01ca; break;
1207
- case 0x01f3: cp = 0x01f1; break;
1208
- case 0x01bf: cp = 0x01f7; break;
1209
- case 0x0188: cp = 0x0187; break;
1210
- case 0x018c: cp = 0x018b; break;
1211
- case 0x0192: cp = 0x0191; break;
1212
- case 0x0199: cp = 0x0198; break;
1213
- case 0x01a8: cp = 0x01a7; break;
1214
- case 0x01ad: cp = 0x01ac; break;
1215
- case 0x01b0: cp = 0x01af; break;
1216
- case 0x01b9: cp = 0x01b8; break;
1217
- case 0x01bd: cp = 0x01bc; break;
1218
- case 0x01f5: cp = 0x01f4; break;
1219
- case 0x023c: cp = 0x023b; break;
1220
- case 0x0242: cp = 0x0241; break;
1221
- case 0x037b: cp = 0x03fd; break;
1222
- case 0x037c: cp = 0x03fe; break;
1223
- case 0x037d: cp = 0x03ff; break;
1224
- case 0x03f3: cp = 0x037f; break;
1225
- case 0x03ac: cp = 0x0386; break;
1226
- case 0x03ad: cp = 0x0388; break;
1227
- case 0x03ae: cp = 0x0389; break;
1228
- case 0x03af: cp = 0x038a; break;
1229
- case 0x03cc: cp = 0x038c; break;
1230
- case 0x03cd: cp = 0x038e; break;
1231
- case 0x03ce: cp = 0x038f; break;
1232
- case 0x0371: cp = 0x0370; break;
1233
- case 0x0373: cp = 0x0372; break;
1234
- case 0x0377: cp = 0x0376; break;
1235
- case 0x03d1: cp = 0x03f4; break;
1236
- case 0x03d7: cp = 0x03cf; break;
1237
- case 0x03f2: cp = 0x03f9; break;
1238
- case 0x03f8: cp = 0x03f7; break;
1239
- case 0x03fb: cp = 0x03fa; break;
1240
- };
1541
+ default:
1542
+ break;
1543
+ case 0x00ff:
1544
+ cp = 0x0178;
1545
+ break;
1546
+ case 0x0180:
1547
+ cp = 0x0243;
1548
+ break;
1549
+ case 0x01dd:
1550
+ cp = 0x018e;
1551
+ break;
1552
+ case 0x019a:
1553
+ cp = 0x023d;
1554
+ break;
1555
+ case 0x019e:
1556
+ cp = 0x0220;
1557
+ break;
1558
+ case 0x0292:
1559
+ cp = 0x01b7;
1560
+ break;
1561
+ case 0x01c6:
1562
+ cp = 0x01c4;
1563
+ break;
1564
+ case 0x01c9:
1565
+ cp = 0x01c7;
1566
+ break;
1567
+ case 0x01cc:
1568
+ cp = 0x01ca;
1569
+ break;
1570
+ case 0x01f3:
1571
+ cp = 0x01f1;
1572
+ break;
1573
+ case 0x01bf:
1574
+ cp = 0x01f7;
1575
+ break;
1576
+ case 0x0188:
1577
+ cp = 0x0187;
1578
+ break;
1579
+ case 0x018c:
1580
+ cp = 0x018b;
1581
+ break;
1582
+ case 0x0192:
1583
+ cp = 0x0191;
1584
+ break;
1585
+ case 0x0199:
1586
+ cp = 0x0198;
1587
+ break;
1588
+ case 0x01a8:
1589
+ cp = 0x01a7;
1590
+ break;
1591
+ case 0x01ad:
1592
+ cp = 0x01ac;
1593
+ break;
1594
+ case 0x01b9:
1595
+ cp = 0x01b8;
1596
+ break;
1597
+ case 0x01bd:
1598
+ cp = 0x01bc;
1599
+ break;
1600
+ case 0x01f5:
1601
+ cp = 0x01f4;
1602
+ break;
1603
+ case 0x023c:
1604
+ cp = 0x023b;
1605
+ break;
1606
+ case 0x0242:
1607
+ cp = 0x0241;
1608
+ break;
1609
+ case 0x037b:
1610
+ cp = 0x03fd;
1611
+ break;
1612
+ case 0x037c:
1613
+ cp = 0x03fe;
1614
+ break;
1615
+ case 0x037d:
1616
+ cp = 0x03ff;
1617
+ break;
1618
+ case 0x03f3:
1619
+ cp = 0x037f;
1620
+ break;
1621
+ case 0x03ac:
1622
+ cp = 0x0386;
1623
+ break;
1624
+ case 0x03ad:
1625
+ cp = 0x0388;
1626
+ break;
1627
+ case 0x03ae:
1628
+ cp = 0x0389;
1629
+ break;
1630
+ case 0x03af:
1631
+ cp = 0x038a;
1632
+ break;
1633
+ case 0x03cc:
1634
+ cp = 0x038c;
1635
+ break;
1636
+ case 0x03cd:
1637
+ cp = 0x038e;
1638
+ break;
1639
+ case 0x03ce:
1640
+ cp = 0x038f;
1641
+ break;
1642
+ case 0x0371:
1643
+ cp = 0x0370;
1644
+ break;
1645
+ case 0x0373:
1646
+ cp = 0x0372;
1647
+ break;
1648
+ case 0x0377:
1649
+ cp = 0x0376;
1650
+ break;
1651
+ case 0x03d1:
1652
+ cp = 0x0398;
1653
+ break;
1654
+ case 0x03d7:
1655
+ cp = 0x03cf;
1656
+ break;
1657
+ case 0x03f2:
1658
+ cp = 0x03f9;
1659
+ break;
1660
+ case 0x03f8:
1661
+ cp = 0x03f7;
1662
+ break;
1663
+ case 0x03fb:
1664
+ cp = 0x03fa;
1665
+ break;
1666
+ }
1241
1667
  }
1242
1668
 
1243
1669
  return cp;
1244
1670
  }
1245
1671
 
1672
+ utf8_constexpr14_impl utf8_int8_t *
1673
+ utf8rcodepoint(const utf8_int8_t *utf8_restrict str,
1674
+ utf8_int32_t *utf8_restrict out_codepoint) {
1675
+ const utf8_int8_t *s = (const utf8_int8_t *)str;
1676
+
1677
+ if (0xf0 == (0xf8 & s[0])) {
1678
+ /* 4 byte utf8 codepoint */
1679
+ *out_codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) |
1680
+ ((0x3f & s[2]) << 6) | (0x3f & s[3]);
1681
+ } else if (0xe0 == (0xf0 & s[0])) {
1682
+ /* 3 byte utf8 codepoint */
1683
+ *out_codepoint =
1684
+ ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
1685
+ } else if (0xc0 == (0xe0 & s[0])) {
1686
+ /* 2 byte utf8 codepoint */
1687
+ *out_codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
1688
+ } else {
1689
+ /* 1 byte utf8 codepoint otherwise */
1690
+ *out_codepoint = s[0];
1691
+ }
1692
+
1693
+ do {
1694
+ s--;
1695
+ } while ((0 != (0x80 & s[0])) && (0x80 == (0xc0 & s[0])));
1696
+
1697
+ return (utf8_int8_t *)s;
1698
+ }
1699
+
1246
1700
  #undef utf8_restrict
1701
+ #undef utf8_constexpr14
1247
1702
  #undef utf8_null
1248
1703
 
1249
- #ifdef __cplusplus
1250
- } // extern "C"
1704
+ #ifdef utf8_cplusplus
1705
+ } /* extern "C" */
1251
1706
  #endif
1252
1707
 
1253
1708
  #if defined(__clang__)
1254
1709
  #pragma clang diagnostic pop
1255
1710
  #endif
1256
1711
 
1257
- #endif // SHEREDOM_UTF8_H_INCLUDED
1712
+ #endif /* SHEREDOM_UTF8_H_INCLUDED */