@fugood/llama.node 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204) hide show
  1. package/CMakeLists.txt +85 -0
  2. package/README.md +56 -0
  3. package/bin/darwin/arm64/llama-node.node +0 -0
  4. package/bin/darwin/x64/llama-node.node +0 -0
  5. package/bin/linux/arm64/llama-node.node +0 -0
  6. package/bin/linux/x64/llama-node.node +0 -0
  7. package/bin/win32/arm64/llama-node.node +0 -0
  8. package/bin/win32/arm64/node.lib +0 -0
  9. package/bin/win32/x64/llama-node.node +0 -0
  10. package/bin/win32/x64/node.lib +0 -0
  11. package/lib/binding.js +13 -0
  12. package/lib/binding.ts +57 -0
  13. package/lib/index.js +24 -0
  14. package/lib/index.ts +13 -0
  15. package/package.json +65 -0
  16. package/src/addons.cpp +506 -0
  17. package/src/llama.cpp/CMakeLists.txt +1320 -0
  18. package/src/llama.cpp/build.zig +172 -0
  19. package/src/llama.cpp/cmake/FindSIMD.cmake +100 -0
  20. package/src/llama.cpp/common/CMakeLists.txt +87 -0
  21. package/src/llama.cpp/common/base64.hpp +392 -0
  22. package/src/llama.cpp/common/common.cpp +2949 -0
  23. package/src/llama.cpp/common/common.h +324 -0
  24. package/src/llama.cpp/common/console.cpp +501 -0
  25. package/src/llama.cpp/common/console.h +19 -0
  26. package/src/llama.cpp/common/grammar-parser.cpp +440 -0
  27. package/src/llama.cpp/common/grammar-parser.h +29 -0
  28. package/src/llama.cpp/common/json-schema-to-grammar.cpp +764 -0
  29. package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
  30. package/src/llama.cpp/common/json.hpp +24766 -0
  31. package/src/llama.cpp/common/log.h +724 -0
  32. package/src/llama.cpp/common/ngram-cache.cpp +282 -0
  33. package/src/llama.cpp/common/ngram-cache.h +94 -0
  34. package/src/llama.cpp/common/sampling.cpp +353 -0
  35. package/src/llama.cpp/common/sampling.h +147 -0
  36. package/src/llama.cpp/common/stb_image.h +8396 -0
  37. package/src/llama.cpp/common/train.cpp +1513 -0
  38. package/src/llama.cpp/common/train.h +233 -0
  39. package/src/llama.cpp/examples/CMakeLists.txt +52 -0
  40. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +5 -0
  41. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +1640 -0
  42. package/src/llama.cpp/examples/batched/CMakeLists.txt +5 -0
  43. package/src/llama.cpp/examples/batched/batched.cpp +262 -0
  44. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +5 -0
  45. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +261 -0
  46. package/src/llama.cpp/examples/beam-search/CMakeLists.txt +5 -0
  47. package/src/llama.cpp/examples/beam-search/beam-search.cpp +188 -0
  48. package/src/llama.cpp/examples/benchmark/CMakeLists.txt +6 -0
  49. package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +275 -0
  50. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
  51. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +936 -0
  52. package/src/llama.cpp/examples/embedding/CMakeLists.txt +5 -0
  53. package/src/llama.cpp/examples/embedding/embedding.cpp +211 -0
  54. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +9 -0
  55. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +195 -0
  56. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +5 -0
  57. package/src/llama.cpp/examples/export-lora/export-lora.cpp +462 -0
  58. package/src/llama.cpp/examples/finetune/CMakeLists.txt +5 -0
  59. package/src/llama.cpp/examples/finetune/finetune.cpp +1861 -0
  60. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +5 -0
  61. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +132 -0
  62. package/src/llama.cpp/examples/gguf/CMakeLists.txt +5 -0
  63. package/src/llama.cpp/examples/gguf/gguf.cpp +256 -0
  64. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +5 -0
  65. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +553 -0
  66. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +5 -0
  67. package/src/llama.cpp/examples/gritlm/gritlm.cpp +215 -0
  68. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +5 -0
  69. package/src/llama.cpp/examples/imatrix/imatrix.cpp +655 -0
  70. package/src/llama.cpp/examples/infill/CMakeLists.txt +5 -0
  71. package/src/llama.cpp/examples/infill/infill.cpp +767 -0
  72. package/src/llama.cpp/examples/jeopardy/questions.txt +100 -0
  73. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +5 -0
  74. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +1286 -0
  75. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/CMakeLists.txt +50 -0
  76. package/src/llama.cpp/examples/llama.android/app/src/main/cpp/llama-android.cpp +443 -0
  77. package/src/llama.cpp/examples/llava/CMakeLists.txt +37 -0
  78. package/src/llama.cpp/examples/llava/clip.cpp +2027 -0
  79. package/src/llama.cpp/examples/llava/clip.h +85 -0
  80. package/src/llama.cpp/examples/llava/llava-cli.cpp +309 -0
  81. package/src/llama.cpp/examples/llava/llava.cpp +426 -0
  82. package/src/llama.cpp/examples/llava/llava.h +50 -0
  83. package/src/llama.cpp/examples/llava/requirements.txt +3 -0
  84. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +5 -0
  85. package/src/llama.cpp/examples/lookahead/lookahead.cpp +485 -0
  86. package/src/llama.cpp/examples/lookup/CMakeLists.txt +23 -0
  87. package/src/llama.cpp/examples/lookup/lookup-create.cpp +41 -0
  88. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +47 -0
  89. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +160 -0
  90. package/src/llama.cpp/examples/lookup/lookup.cpp +258 -0
  91. package/src/llama.cpp/examples/main/CMakeLists.txt +5 -0
  92. package/src/llama.cpp/examples/main/main.cpp +957 -0
  93. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +33 -0
  94. package/src/llama.cpp/examples/parallel/CMakeLists.txt +5 -0
  95. package/src/llama.cpp/examples/parallel/parallel.cpp +427 -0
  96. package/src/llama.cpp/examples/passkey/CMakeLists.txt +5 -0
  97. package/src/llama.cpp/examples/passkey/passkey.cpp +302 -0
  98. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +5 -0
  99. package/src/llama.cpp/examples/perplexity/perplexity.cpp +1943 -0
  100. package/src/llama.cpp/examples/quantize/CMakeLists.txt +6 -0
  101. package/src/llama.cpp/examples/quantize/quantize.cpp +423 -0
  102. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +6 -0
  103. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +424 -0
  104. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +5 -0
  105. package/src/llama.cpp/examples/retrieval/retrieval.cpp +350 -0
  106. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +5 -0
  107. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +246 -0
  108. package/src/llama.cpp/examples/server/CMakeLists.txt +40 -0
  109. package/src/llama.cpp/examples/server/bench/requirements.txt +2 -0
  110. package/src/llama.cpp/examples/server/httplib.h +9465 -0
  111. package/src/llama.cpp/examples/server/server.cpp +3826 -0
  112. package/src/llama.cpp/examples/server/tests/requirements.txt +6 -0
  113. package/src/llama.cpp/examples/server/utils.hpp +653 -0
  114. package/src/llama.cpp/examples/simple/CMakeLists.txt +5 -0
  115. package/src/llama.cpp/examples/simple/simple.cpp +183 -0
  116. package/src/llama.cpp/examples/speculative/CMakeLists.txt +5 -0
  117. package/src/llama.cpp/examples/speculative/speculative.cpp +614 -0
  118. package/src/llama.cpp/examples/sycl/CMakeLists.txt +9 -0
  119. package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +13 -0
  120. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +5 -0
  121. package/src/llama.cpp/examples/tokenize/tokenize.cpp +42 -0
  122. package/src/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +5 -0
  123. package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +1252 -0
  124. package/src/llama.cpp/ggml-alloc.c +985 -0
  125. package/src/llama.cpp/ggml-alloc.h +76 -0
  126. package/src/llama.cpp/ggml-backend-impl.h +141 -0
  127. package/src/llama.cpp/ggml-backend.c +2099 -0
  128. package/src/llama.cpp/ggml-backend.h +233 -0
  129. package/src/llama.cpp/ggml-common.h +1853 -0
  130. package/src/llama.cpp/ggml-cuda.h +43 -0
  131. package/src/llama.cpp/ggml-impl.h +265 -0
  132. package/src/llama.cpp/ggml-kompute.cpp +2006 -0
  133. package/src/llama.cpp/ggml-kompute.h +46 -0
  134. package/src/llama.cpp/ggml-metal.h +66 -0
  135. package/src/llama.cpp/ggml-mpi.c +216 -0
  136. package/src/llama.cpp/ggml-mpi.h +39 -0
  137. package/src/llama.cpp/ggml-opencl.cpp +2301 -0
  138. package/src/llama.cpp/ggml-opencl.h +36 -0
  139. package/src/llama.cpp/ggml-quants.c +12678 -0
  140. package/src/llama.cpp/ggml-quants.h +133 -0
  141. package/src/llama.cpp/ggml-sycl.cpp +17882 -0
  142. package/src/llama.cpp/ggml-sycl.h +49 -0
  143. package/src/llama.cpp/ggml-vulkan-shaders.hpp +69849 -0
  144. package/src/llama.cpp/ggml-vulkan.cpp +6442 -0
  145. package/src/llama.cpp/ggml-vulkan.h +29 -0
  146. package/src/llama.cpp/ggml.c +21819 -0
  147. package/src/llama.cpp/ggml.h +2403 -0
  148. package/src/llama.cpp/llama.cpp +17468 -0
  149. package/src/llama.cpp/llama.h +1117 -0
  150. package/src/llama.cpp/pocs/CMakeLists.txt +12 -0
  151. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +9 -0
  152. package/src/llama.cpp/pocs/vdot/q8dot.cpp +172 -0
  153. package/src/llama.cpp/pocs/vdot/vdot.cpp +310 -0
  154. package/src/llama.cpp/prompts/LLM-questions.txt +49 -0
  155. package/src/llama.cpp/prompts/alpaca.txt +1 -0
  156. package/src/llama.cpp/prompts/assistant.txt +31 -0
  157. package/src/llama.cpp/prompts/chat-with-baichuan.txt +4 -0
  158. package/src/llama.cpp/prompts/chat-with-bob.txt +7 -0
  159. package/src/llama.cpp/prompts/chat-with-qwen.txt +1 -0
  160. package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +7 -0
  161. package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +7 -0
  162. package/src/llama.cpp/prompts/chat.txt +28 -0
  163. package/src/llama.cpp/prompts/dan-modified.txt +1 -0
  164. package/src/llama.cpp/prompts/dan.txt +1 -0
  165. package/src/llama.cpp/prompts/mnemonics.txt +93 -0
  166. package/src/llama.cpp/prompts/parallel-questions.txt +43 -0
  167. package/src/llama.cpp/prompts/reason-act.txt +18 -0
  168. package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +3 -0
  169. package/src/llama.cpp/requirements/requirements-convert-llama-ggml-to-gguf.txt +1 -0
  170. package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +2 -0
  171. package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +2 -0
  172. package/src/llama.cpp/requirements/requirements-convert.txt +5 -0
  173. package/src/llama.cpp/requirements.txt +12 -0
  174. package/src/llama.cpp/scripts/gen-build-info-cpp.cmake +24 -0
  175. package/src/llama.cpp/scripts/xxd.cmake +16 -0
  176. package/src/llama.cpp/sgemm.cpp +999 -0
  177. package/src/llama.cpp/sgemm.h +12 -0
  178. package/src/llama.cpp/tests/CMakeLists.txt +78 -0
  179. package/src/llama.cpp/tests/get-model.cpp +21 -0
  180. package/src/llama.cpp/tests/get-model.h +2 -0
  181. package/src/llama.cpp/tests/test-autorelease.cpp +24 -0
  182. package/src/llama.cpp/tests/test-backend-ops.cpp +2266 -0
  183. package/src/llama.cpp/tests/test-c.c +7 -0
  184. package/src/llama.cpp/tests/test-chat-template.cpp +107 -0
  185. package/src/llama.cpp/tests/test-double-float.cpp +57 -0
  186. package/src/llama.cpp/tests/test-grad0.cpp +1606 -0
  187. package/src/llama.cpp/tests/test-grammar-integration.cpp +243 -0
  188. package/src/llama.cpp/tests/test-grammar-parser.cpp +250 -0
  189. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +899 -0
  190. package/src/llama.cpp/tests/test-llama-grammar.cpp +402 -0
  191. package/src/llama.cpp/tests/test-model-load-cancel.cpp +27 -0
  192. package/src/llama.cpp/tests/test-opt.cpp +181 -0
  193. package/src/llama.cpp/tests/test-quantize-fns.cpp +185 -0
  194. package/src/llama.cpp/tests/test-quantize-perf.cpp +363 -0
  195. package/src/llama.cpp/tests/test-rope.cpp +221 -0
  196. package/src/llama.cpp/tests/test-sampling.cpp +301 -0
  197. package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +187 -0
  198. package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +190 -0
  199. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +123 -0
  200. package/src/llama.cpp/tests/test-tokenizer-1-llama.cpp +111 -0
  201. package/src/llama.cpp/unicode-data.cpp +1651 -0
  202. package/src/llama.cpp/unicode-data.h +16 -0
  203. package/src/llama.cpp/unicode.cpp +277 -0
  204. package/src/llama.cpp/unicode.h +28 -0
@@ -0,0 +1,16 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <map>
5
+ #include <utility>
6
+ #include <vector>
7
+
8
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit; // inclusive [first, last] codepoint ranges tagged CODEPOINT_TYPE_DIGIT by unicode_cpt_type_map()
9
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter; // inclusive ranges tagged CODEPOINT_TYPE_LETTER
10
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace; // inclusive ranges tagged CODEPOINT_TYPE_WHITESPACE
11
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark; // inclusive ranges tagged CODEPOINT_TYPE_ACCENT_MARK
12
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation; // inclusive ranges tagged CODEPOINT_TYPE_PUNCTUATION
13
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_symbol; // inclusive ranges tagged CODEPOINT_TYPE_SYMBOL
14
+ extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_control; // inclusive ranges tagged CODEPOINT_TYPE_CONTROL
15
+ extern const std::multimap<uint32_t, uint32_t> unicode_map_nfd; // codepoint -> replacement codepoint, consulted by unicode_cpts_normalize_nfd()
16
+ extern const std::map<char32_t, char32_t> unicode_map_lowercase; // codepoint -> replacement codepoint, consulted by unicode_tolower()
@@ -0,0 +1,277 @@
1
+ #include "unicode.h"
2
+ #include "unicode-data.h"
3
+
4
+ #include <cassert>
5
+ #include <cstddef>
6
+ #include <cstdint>
7
+ #include <map>
8
+ #include <stdexcept>
9
+ #include <string>
10
+ #include <unordered_map>
11
+ #include <utility>
12
+ #include <vector>
13
+
14
+ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
15
+ std::string result;
16
+ for (size_t i = 0; i < cps.size(); ++i) {
17
+ result.append(unicode_cpt_to_utf8(cps[i]));
18
+ }
19
+ return result;
20
+ }
21
+
22
// Decodes the codepoint that starts at utf8[offset] and advances `offset`
// past the bytes that were consumed (1 to 4).
//
// The sequence length is taken from the high bits of the lead byte; every
// following byte must exist and have the form 10xxxxxx.
//
// Throws std::invalid_argument("invalid character") when the lead byte is a
// continuation byte or when a continuation byte is missing or malformed, and
// std::invalid_argument("invalid string") when the lead byte has its five top
// bits set. `offset` is left untouched when an exception is thrown.
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());

    const size_t start = offset;

    // byte at position start + k, widened without sign extension
    const auto byte_at = [&utf8, start](size_t k) -> uint32_t {
        return static_cast<uint8_t>(utf8[start + k]);
    };

    // true when n_tail continuation bytes (10xxxxxx) follow the lead byte
    const auto has_tail = [&utf8, start, &byte_at](size_t n_tail) -> bool {
        if (start + n_tail >= utf8.size()) {
            return false;
        }
        for (size_t k = 1; k <= n_tail; ++k) {
            if ((byte_at(k) & 0xc0) != 0x80) {
                return false;
            }
        }
        return true;
    };

    const uint32_t lead = byte_at(0);

    // 0xxxxxxx: single byte
    if ((lead & 0x80) == 0) {
        offset = start + 1;
        return lead;
    }

    // 10xxxxxx: a continuation byte cannot start a sequence
    if ((lead & 0x40) == 0) {
        throw std::invalid_argument("invalid character");
    }

    // 110xxxxx: two bytes
    if ((lead & 0x20) == 0) {
        if (!has_tail(1)) {
            throw std::invalid_argument("invalid character");
        }
        offset = start + 2;
        return ((lead & 0x1f) << 6) | (byte_at(1) & 0x3f);
    }

    // 1110xxxx: three bytes
    if ((lead & 0x10) == 0) {
        if (!has_tail(2)) {
            throw std::invalid_argument("invalid character");
        }
        offset = start + 3;
        return ((lead & 0x0f) << 12) | ((byte_at(1) & 0x3f) << 6) | (byte_at(2) & 0x3f);
    }

    // 11110xxx: four bytes
    if ((lead & 0x08) == 0) {
        if (!has_tail(3)) {
            throw std::invalid_argument("invalid character");
        }
        offset = start + 4;
        return ((lead & 0x07) << 18) | ((byte_at(1) & 0x3f) << 12) | ((byte_at(2) & 0x3f) << 6) | (byte_at(3) & 0x3f);
    }

    // 11111xxx: not a valid lead byte
    throw std::invalid_argument("invalid string");
}
58
+
59
// Encodes one codepoint as UTF-16 code units.
//
// Codepoints up to 0xffff are emitted as a single unit with the same value;
// codepoints in 0x10000..0x10ffff are emitted as a high/low surrogate pair.
// Throws std::invalid_argument("invalid cpt") for anything above 0x10ffff.
static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
    if (cp > 0x10ffff) {
        throw std::invalid_argument("invalid cpt");
    }

    if (cp <= 0xffff) {
        return { static_cast<uint16_t>(cp) };
    }

    // supplementary plane: split the 20-bit value into two 10-bit halves
    const uint32_t v = cp - 0x10000;
    return {
        static_cast<uint16_t>(0xd800 | (v >> 10)),
        static_cast<uint16_t>(0xdc00 | (v & 0x03ff)),
    };
}
73
+
74
+ //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
75
+ // std::vector<uint16_t> result;
76
+ // for (size_t i = 0; i < cps.size(); ++i) {
77
+ // auto temp = unicode_cpt_to_utf16(cps[i]);
78
+ // result.insert(result.end(), temp.begin(), temp.end());
79
+ // }
80
+ // return result;
81
+ //}
82
+
83
// Decodes the codepoint that starts at utf16[offset] and advances `offset`
// past the code units that were consumed (1 or 2).
//
// A unit that is not a high surrogate (0xd800..0xdbff) is returned as-is; this
// includes a stray low surrogate, which is passed through unchanged. A high
// surrogate must be followed by a low surrogate (0xdc00..0xdfff), otherwise
// std::invalid_argument("invalid character") is thrown and `offset` is left
// untouched.
//
// Fixes over the previous version: the units are read relative to `offset`
// (they used to be read from indices 0 and 1 regardless of `offset`), and the
// low-surrogate test uses the full 6-bit tag mask 0xfc00 (the old mask 0xdc00
// also accepted 0xfc00..0xffff).
static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
    assert(offset < utf16.size());

    const uint32_t lead = utf16[offset];
    if ((lead & 0xfc00) != 0xd800) {
        offset += 1;
        return lead;
    }

    if (offset + 1 >= utf16.size()) {
        throw std::invalid_argument("invalid character");
    }

    const uint32_t trail = utf16[offset + 1];
    if ((trail & 0xfc00) != 0xdc00) {
        throw std::invalid_argument("invalid character");
    }

    offset += 2;
    return 0x10000 + (((lead & 0x03ff) << 10) | (trail & 0x03ff));
}
99
+
100
+ //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
101
+ // std::vector<uint32_t> result;
102
+ // size_t offset = 0;
103
+ // while (offset < utf16.size()) {
104
+ // result.push_back(cpt_from_utf16(utf16, offset));
105
+ // }
106
+ // return result;
107
+ //}
108
+
109
+ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
110
+ std::unordered_map<uint32_t, int> cpt_types;
111
+ for (auto p : unicode_ranges_digit) {
112
+ for (auto i = p.first; i <= p.second; ++ i) {
113
+ cpt_types[i] = CODEPOINT_TYPE_DIGIT;
114
+ }
115
+ }
116
+ for (auto p : unicode_ranges_letter) {
117
+ for (auto i = p.first; i <= p.second; ++ i) {
118
+ cpt_types[i] = CODEPOINT_TYPE_LETTER;
119
+ }
120
+ }
121
+ for (auto p : unicode_ranges_whitespace) {
122
+ for (auto i = p.first; i <= p.second; ++ i) {
123
+ cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
124
+ }
125
+ }
126
+ for (auto p : unicode_ranges_accent_mark) {
127
+ for (auto i = p.first; i <= p.second; ++ i) {
128
+ cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
129
+ }
130
+ }
131
+ for (auto p : unicode_ranges_punctuation) {
132
+ for (auto i = p.first; i <= p.second; ++ i) {
133
+ cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
134
+ }
135
+ }
136
+ for (auto p : unicode_ranges_symbol) {
137
+ for (auto i = p.first; i <= p.second; ++i) {
138
+ cpt_types[i] = CODEPOINT_TYPE_SYMBOL;
139
+ }
140
+ }
141
+ for (auto p : unicode_ranges_control) {
142
+ for (auto i = p.first; i <= p.second; ++ i) {
143
+ cpt_types[i] = CODEPOINT_TYPE_CONTROL;
144
+ }
145
+ }
146
+ return cpt_types;
147
+ }
148
+
149
+ static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
150
+ std::unordered_map<uint8_t, std::string> map;
151
+ for (int ch = u'!'; ch <= u'~'; ++ch) {
152
+ assert(0 <= ch && ch < 256);
153
+ map[ch] = unicode_cpt_to_utf8(ch);
154
+ }
155
+ for (int ch = u'¡'; ch <= u'¬'; ++ch) {
156
+ assert(0 <= ch && ch < 256);
157
+ map[ch] = unicode_cpt_to_utf8(ch);
158
+ }
159
+ for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
160
+ assert(0 <= ch && ch < 256);
161
+ map[ch] = unicode_cpt_to_utf8(ch);
162
+ }
163
+ auto n = 0;
164
+ for (int ch = 0; ch < 256; ++ch) {
165
+ if (map.find(ch) == map.end()) {
166
+ map[ch] = unicode_cpt_to_utf8(256 + n);
167
+ ++n;
168
+ }
169
+ }
170
+ return map;
171
+ }
172
+
173
+ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
174
+ std::unordered_map<std::string, uint8_t> map;
175
+ for (int ch = u'!'; ch <= u'~'; ++ch) {
176
+ assert(0 <= ch && ch < 256);
177
+ map[unicode_cpt_to_utf8(ch)] = ch;
178
+ }
179
+ for (int ch = u'¡'; ch <= u'¬'; ++ch) {
180
+ assert(0 <= ch && ch < 256);
181
+ map[unicode_cpt_to_utf8(ch)] = ch;
182
+ }
183
+ for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
184
+ assert(0 <= ch && ch < 256);
185
+ map[unicode_cpt_to_utf8(ch)] = ch;
186
+ }
187
+ auto n = 0;
188
+ for (int ch = 0; ch < 256; ++ch) {
189
+ if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
190
+ map[unicode_cpt_to_utf8(256 + n)] = ch;
191
+ ++n;
192
+ }
193
+ }
194
+ return map;
195
+ }
196
+
197
+ //
198
+ // interface
199
+ //
200
+
201
// Encodes one codepoint as UTF-8 (1 to 4 bytes, chosen by magnitude).
// Throws std::invalid_argument("invalid codepoint") when cp > 0x10ffff.
std::string unicode_cpt_to_utf8(uint32_t cp) {
    if (cp > 0x10ffff) {
        throw std::invalid_argument("invalid codepoint");
    }

    // continuation byte 10xxxxxx carrying the six bits of cp that start at `shift`
    const auto cont = [cp](unsigned shift) -> char {
        return static_cast<char>(0x80 | ((cp >> shift) & 0x3f));
    };

    if (cp <= 0x7f) {
        return std::string(1, static_cast<char>(cp));
    }
    if (cp <= 0x7ff) {
        const char lead = static_cast<char>(0xc0 | ((cp >> 6) & 0x1f));
        return std::string{ lead, cont(0) };
    }
    if (cp <= 0xffff) {
        const char lead = static_cast<char>(0xe0 | ((cp >> 12) & 0x0f));
        return std::string{ lead, cont(6), cont(0) };
    }

    const char lead = static_cast<char>(0xf0 | ((cp >> 18) & 0x07));
    return std::string{ lead, cont(12), cont(6), cont(0) };
}
226
+
227
+ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
228
+ std::vector<uint32_t> result;
229
+ result.reserve(cpts.size());
230
+ for (size_t i = 0; i < cpts.size(); ++i) {
231
+ auto it = unicode_map_nfd.find(cpts[i]);
232
+ if (it == unicode_map_nfd.end()) {
233
+ result.push_back(cpts[i]);
234
+ } else {
235
+ result.push_back(it->second);
236
+ }
237
+ }
238
+ return result;
239
+ }
240
+
241
+ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
242
+ std::vector<uint32_t> result;
243
+ size_t offset = 0;
244
+ while (offset < utf8.size()) {
245
+ result.push_back(unicode_cpt_from_utf8(utf8, offset));
246
+ }
247
+ return result;
248
+ }
249
+
250
+ int unicode_cpt_type(uint32_t cp) {
251
+ static std::unordered_map<uint32_t, int> cpt_types = unicode_cpt_type_map();
252
+ const auto it = cpt_types.find(cp);
253
+ return it == cpt_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : it->second;
254
+ }
255
+
256
+ int unicode_cpt_type(const std::string & utf8) {
257
+ if (utf8.length() == 0) {
258
+ return CODEPOINT_TYPE_UNIDENTIFIED;
259
+ }
260
+ size_t offset = 0;
261
+ return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
262
+ }
263
+
264
+ std::string unicode_byte_to_utf8(uint8_t byte) {
265
+ static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
266
+ return map.at(byte);
267
+ }
268
+
269
+ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
270
+ static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
271
+ return map.at(utf8);
272
+ }
273
+
274
+ char32_t unicode_tolower(char32_t cp) {
275
+ auto it = unicode_map_lowercase.find(cp);
276
+ return it == unicode_map_lowercase.end() ? cp : it->second;
277
+ }
@@ -0,0 +1,28 @@
1
+ #pragma once
2
+
3
+ #include <cstdint>
4
+ #include <string>
5
+ #include <vector>
6
+
7
+ #define CODEPOINT_TYPE_UNIDENTIFIED 0 // returned by unicode_cpt_type() for a codepoint in no range table, and for an empty string
8
+ #define CODEPOINT_TYPE_DIGIT 1 // assigned to codepoints in unicode_ranges_digit
9
+ #define CODEPOINT_TYPE_LETTER 2 // assigned to codepoints in unicode_ranges_letter
10
+ #define CODEPOINT_TYPE_WHITESPACE 3 // assigned to codepoints in unicode_ranges_whitespace
11
+ #define CODEPOINT_TYPE_ACCENT_MARK 4 // assigned to codepoints in unicode_ranges_accent_mark
12
+ #define CODEPOINT_TYPE_PUNCTUATION 5 // assigned to codepoints in unicode_ranges_punctuation
13
+ #define CODEPOINT_TYPE_SYMBOL 6 // assigned to codepoints in unicode_ranges_symbol
14
+ #define CODEPOINT_TYPE_CONTROL 7 // assigned to codepoints in unicode_ranges_control
15
+
16
+ std::string unicode_cpt_to_utf8(uint32_t cp); // UTF-8 encoding of one codepoint; throws std::invalid_argument when cp > 0x10ffff
17
+ std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); // decodes a whole UTF-8 string; throws std::invalid_argument on a malformed sequence
18
+
19
+ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); // replaces each codepoint found in unicode_map_nfd by its mapped value; others are kept
20
+
21
+ int unicode_cpt_type(uint32_t cp); // one of the CODEPOINT_TYPE_* values defined above
22
+ int unicode_cpt_type(const std::string & utf8); // type of the first codepoint in utf8; CODEPOINT_TYPE_UNIDENTIFIED for an empty string
23
+
24
+ std::string unicode_byte_to_utf8(uint8_t byte); // looks the byte up in a table that is built on first use
25
+ uint8_t unicode_utf8_to_byte(const std::string & utf8); // reverse lookup; unknown strings make std::unordered_map::at throw std::out_of_range
26
+
27
+ // simple tolower that only implements one-to-one mapping, not one-to-many
28
+ char32_t unicode_tolower(char32_t cp);