llama-cpp-capacitor 0.0.6 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. package/android/src/main/CMakeLists.txt +9 -9
  2. package/cpp/LICENSE +21 -0
  3. package/cpp/README.md +4 -0
  4. package/cpp/anyascii.c +22223 -0
  5. package/cpp/anyascii.h +42 -0
  6. package/cpp/chat-parser.cpp +393 -0
  7. package/cpp/chat-parser.h +120 -0
  8. package/cpp/chat.cpp +2315 -0
  9. package/cpp/chat.h +221 -0
  10. package/cpp/common.cpp +1619 -0
  11. package/cpp/common.h +744 -0
  12. package/cpp/ggml-alloc.c +1028 -0
  13. package/cpp/ggml-alloc.h +76 -0
  14. package/cpp/ggml-backend-impl.h +255 -0
  15. package/cpp/ggml-backend-reg.cpp +600 -0
  16. package/cpp/ggml-backend.cpp +2118 -0
  17. package/cpp/ggml-backend.h +354 -0
  18. package/cpp/ggml-common.h +1878 -0
  19. package/cpp/ggml-cpp.h +39 -0
  20. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  21. package/cpp/ggml-cpu/amx/amx.h +8 -0
  22. package/cpp/ggml-cpu/amx/common.h +91 -0
  23. package/cpp/ggml-cpu/amx/mmq.cpp +2512 -0
  24. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  25. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  26. package/cpp/ggml-cpu/arch/arm/quants.c +3650 -0
  27. package/cpp/ggml-cpu/arch/arm/repack.cpp +1891 -0
  28. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  29. package/cpp/ggml-cpu/arch/x86/quants.c +3820 -0
  30. package/cpp/ggml-cpu/arch/x86/repack.cpp +6307 -0
  31. package/cpp/ggml-cpu/arch-fallback.h +215 -0
  32. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  33. package/cpp/ggml-cpu/binary-ops.h +16 -0
  34. package/cpp/ggml-cpu/common.h +73 -0
  35. package/cpp/ggml-cpu/ggml-cpu-impl.h +525 -0
  36. package/cpp/ggml-cpu/ggml-cpu.c +3578 -0
  37. package/cpp/ggml-cpu/ggml-cpu.cpp +672 -0
  38. package/cpp/ggml-cpu/ops.cpp +10587 -0
  39. package/cpp/ggml-cpu/ops.h +114 -0
  40. package/cpp/ggml-cpu/quants.c +1193 -0
  41. package/cpp/ggml-cpu/quants.h +97 -0
  42. package/cpp/ggml-cpu/repack.cpp +1982 -0
  43. package/cpp/ggml-cpu/repack.h +120 -0
  44. package/cpp/ggml-cpu/simd-mappings.h +1184 -0
  45. package/cpp/ggml-cpu/traits.cpp +36 -0
  46. package/cpp/ggml-cpu/traits.h +38 -0
  47. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  48. package/cpp/ggml-cpu/unary-ops.h +28 -0
  49. package/cpp/ggml-cpu/vec.cpp +348 -0
  50. package/cpp/ggml-cpu/vec.h +1121 -0
  51. package/cpp/ggml-cpu.h +145 -0
  52. package/cpp/ggml-impl.h +622 -0
  53. package/cpp/ggml-metal-impl.h +688 -0
  54. package/cpp/ggml-metal.h +66 -0
  55. package/cpp/ggml-metal.m +6833 -0
  56. package/cpp/ggml-opt.cpp +1093 -0
  57. package/cpp/ggml-opt.h +256 -0
  58. package/cpp/ggml-quants.c +5324 -0
  59. package/cpp/ggml-quants.h +106 -0
  60. package/cpp/ggml-threading.cpp +12 -0
  61. package/cpp/ggml-threading.h +14 -0
  62. package/cpp/ggml.c +7108 -0
  63. package/cpp/ggml.h +2492 -0
  64. package/cpp/gguf.cpp +1358 -0
  65. package/cpp/gguf.h +202 -0
  66. package/cpp/json-partial.cpp +256 -0
  67. package/cpp/json-partial.h +38 -0
  68. package/cpp/json-schema-to-grammar.cpp +985 -0
  69. package/cpp/json-schema-to-grammar.h +21 -0
  70. package/cpp/llama-adapter.cpp +388 -0
  71. package/cpp/llama-adapter.h +76 -0
  72. package/cpp/llama-arch.cpp +2355 -0
  73. package/cpp/llama-arch.h +499 -0
  74. package/cpp/llama-batch.cpp +875 -0
  75. package/cpp/llama-batch.h +160 -0
  76. package/cpp/llama-chat.cpp +783 -0
  77. package/cpp/llama-chat.h +65 -0
  78. package/cpp/llama-context.cpp +2748 -0
  79. package/cpp/llama-context.h +306 -0
  80. package/cpp/llama-cparams.cpp +5 -0
  81. package/cpp/llama-cparams.h +41 -0
  82. package/cpp/llama-cpp.h +30 -0
  83. package/cpp/llama-grammar.cpp +1229 -0
  84. package/cpp/llama-grammar.h +173 -0
  85. package/cpp/llama-graph.cpp +1891 -0
  86. package/cpp/llama-graph.h +810 -0
  87. package/cpp/llama-hparams.cpp +180 -0
  88. package/cpp/llama-hparams.h +233 -0
  89. package/cpp/llama-impl.cpp +167 -0
  90. package/cpp/llama-impl.h +61 -0
  91. package/cpp/llama-io.cpp +15 -0
  92. package/cpp/llama-io.h +35 -0
  93. package/cpp/llama-kv-cache-iswa.cpp +318 -0
  94. package/cpp/llama-kv-cache-iswa.h +135 -0
  95. package/cpp/llama-kv-cache.cpp +2059 -0
  96. package/cpp/llama-kv-cache.h +374 -0
  97. package/cpp/llama-kv-cells.h +491 -0
  98. package/cpp/llama-memory-hybrid.cpp +258 -0
  99. package/cpp/llama-memory-hybrid.h +137 -0
  100. package/cpp/llama-memory-recurrent.cpp +1146 -0
  101. package/cpp/llama-memory-recurrent.h +179 -0
  102. package/cpp/llama-memory.cpp +59 -0
  103. package/cpp/llama-memory.h +119 -0
  104. package/cpp/llama-mmap.cpp +600 -0
  105. package/cpp/llama-mmap.h +68 -0
  106. package/cpp/llama-model-loader.cpp +1164 -0
  107. package/cpp/llama-model-loader.h +170 -0
  108. package/cpp/llama-model-saver.cpp +282 -0
  109. package/cpp/llama-model-saver.h +37 -0
  110. package/cpp/llama-model.cpp +19042 -0
  111. package/cpp/llama-model.h +491 -0
  112. package/cpp/llama-sampling.cpp +2575 -0
  113. package/cpp/llama-sampling.h +32 -0
  114. package/cpp/llama-vocab.cpp +3792 -0
  115. package/cpp/llama-vocab.h +176 -0
  116. package/cpp/llama.cpp +358 -0
  117. package/cpp/llama.h +1373 -0
  118. package/cpp/log.cpp +427 -0
  119. package/cpp/log.h +103 -0
  120. package/cpp/minja/chat-template.hpp +550 -0
  121. package/cpp/minja/minja.hpp +3009 -0
  122. package/cpp/nlohmann/json.hpp +25526 -0
  123. package/cpp/nlohmann/json_fwd.hpp +187 -0
  124. package/cpp/regex-partial.cpp +204 -0
  125. package/cpp/regex-partial.h +56 -0
  126. package/cpp/rn-completion.cpp +681 -0
  127. package/cpp/rn-completion.h +116 -0
  128. package/cpp/rn-llama.cpp +345 -0
  129. package/cpp/rn-llama.h +149 -0
  130. package/cpp/rn-mtmd.hpp +602 -0
  131. package/cpp/rn-tts.cpp +591 -0
  132. package/cpp/rn-tts.h +59 -0
  133. package/cpp/sampling.cpp +579 -0
  134. package/cpp/sampling.h +107 -0
  135. package/cpp/tools/mtmd/clip-impl.h +473 -0
  136. package/cpp/tools/mtmd/clip.cpp +4322 -0
  137. package/cpp/tools/mtmd/clip.h +106 -0
  138. package/cpp/tools/mtmd/miniaudio/miniaudio.h +93468 -0
  139. package/cpp/tools/mtmd/mtmd-audio.cpp +769 -0
  140. package/cpp/tools/mtmd/mtmd-audio.h +47 -0
  141. package/cpp/tools/mtmd/mtmd-helper.cpp +460 -0
  142. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  143. package/cpp/tools/mtmd/mtmd.cpp +1066 -0
  144. package/cpp/tools/mtmd/mtmd.h +298 -0
  145. package/cpp/tools/mtmd/stb/stb_image.h +7988 -0
  146. package/cpp/unicode-data.cpp +7034 -0
  147. package/cpp/unicode-data.h +20 -0
  148. package/cpp/unicode.cpp +1061 -0
  149. package/cpp/unicode.h +68 -0
  150. package/package.json +2 -1
@@ -0,0 +1,3792 @@
1
+ #include "llama-vocab.h"
2
+
3
+ #include "ggml.h"
4
+ #include "gguf.h"
5
+ #include "llama-impl.h"
6
+ #include "llama-model-loader.h"
7
+
8
+ #include "unicode.h"
9
+
10
+ #include <algorithm>
11
+ #include <cassert>
12
+ #include <cctype>
13
+ #include <cfloat>
14
+ #include <cmath>
15
+ #include <cstdarg>
16
+ #include <cstring>
17
+ #include <forward_list>
18
+ #include <limits>
19
+ #include <map>
20
+ #include <queue>
21
+ #include <set>
22
+ #include <unordered_map>
23
+
24
+ //
25
+ // helpers
26
+ //
27
+
28
+ struct naive_trie {
29
+ naive_trie() : has_value(false), value(0) {
30
+ }
31
+ void insert(const char * key, size_t len, int32_t value = 0) {
32
+ if (len == 0) {
33
+ this->has_value = true;
34
+ this->value = value;
35
+ return;
36
+ }
37
+ char c = key[0];
38
+ auto res = children.find(c);
39
+ if (res != children.end()) {
40
+ res->second.insert(key + 1, len - 1, value);
41
+ } else {
42
+ auto res = children.insert(std::make_pair(c, naive_trie()));
43
+ res.first->second.insert(key + 1, len - 1, value);
44
+ }
45
+ }
46
+ std::pair<const char *, size_t> get_longest_prefix(const char * key, size_t len, size_t offset = 0) const {
47
+ if (len == 0 || offset == len) {
48
+ return std::make_pair(key, offset);
49
+ }
50
+ char c = key[offset];
51
+ auto res = children.find(c);
52
+ if (res != children.end()) {
53
+ return res->second.get_longest_prefix(key, len, offset + 1);
54
+ }
55
+
56
+ return std::make_pair(key, offset);
57
+ }
58
+ const struct naive_trie * traverse(const char c) const {
59
+ auto res = children.find(c);
60
+ if (res != children.end()) {
61
+ return &res->second;
62
+ }
63
+
64
+ return NULL;
65
+ }
66
+ std::map<char, struct naive_trie> children;
67
+ bool has_value;
68
+ llama_token value;
69
+ };
70
+
71
+ //
72
+ // tokenizers
73
+ //
74
+
75
// Common base for per-vocab tokenizer state. Concrete tokenizers
// (SPM/BPE/WPM/UGM below) derive from it so they can be owned and
// destroyed polymorphically through a base pointer.
struct llm_tokenizer {
    llm_tokenizer() = default;
    virtual ~llm_tokenizer() = default;
};
79
+
80
// A slice of the input text manipulated during tokenization. Symbols form a
// doubly-linked list via prev/next indices into a symbols vector, so adjacent
// symbols can be merged in place without shifting the container.
struct llm_symbol {
    using index = int;
    index prev;        // index of the previous live symbol, or -1 at the start
    index next;        // index of the next live symbol, or -1 at the end
    const char * text; // pointer into the original text buffer (not owned)
    size_t n;          // byte length; set to 0 once a symbol is consumed by a merge
};

// symbols are copied by value into work structures, so keep them POD-like
static_assert(std::is_trivially_copyable<llm_symbol>::value, "llm_symbol is not trivially copyable");
89
+
90
+ //
91
+ // SPM tokenizer
92
+ // original implementation:
93
+ // https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
94
+ //
95
+
96
+ struct llm_bigram_spm {
97
+ struct comparator {
98
+ bool operator()(llm_bigram_spm & l, llm_bigram_spm & r) {
99
+ return (l.score < r.score) || (l.score == r.score && l.left > r.left);
100
+ }
101
+ };
102
+ using queue_storage = std::vector<llm_bigram_spm>;
103
+ using queue = std::priority_queue<llm_bigram_spm, queue_storage, comparator>;
104
+ llm_symbol::index left;
105
+ llm_symbol::index right;
106
+ float score;
107
+ size_t size;
108
+ };
109
+
110
// SPM keeps no precomputed per-vocab state; this type exists so the SPM
// tokenizer can be stored behind the common llm_tokenizer interface.
struct llm_tokenizer_spm : llm_tokenizer {
    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
};
113
+
114
+ struct llm_tokenizer_spm_session {
115
+ llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
116
+
117
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
118
+ // split string into utf8 chars
119
+ int index = 0;
120
+ size_t offs = 0;
121
+ while (offs < text.size()) {
122
+ llm_symbol sym;
123
+ size_t len = unicode_len_utf8(text[offs]);
124
+ sym.text = text.c_str() + offs;
125
+ sym.n = std::min(len, text.size() - offs);
126
+ offs += sym.n;
127
+ sym.prev = index - 1;
128
+ sym.next = offs == text.size() ? -1 : index + 1;
129
+ index++;
130
+ symbols.emplace_back(sym);
131
+ }
132
+
133
+ // seed the work queue with all possible 2-character tokens.
134
+ for (int i = 1; i < (int) symbols.size(); ++i) {
135
+ try_add_bigram(i - 1, i);
136
+ }
137
+
138
+ // keep substituting the highest frequency pairs for as long as we can.
139
+ while (!work_queue.empty()) {
140
+ auto bigram = work_queue.top();
141
+ work_queue.pop();
142
+
143
+ auto & left_sym = symbols[bigram.left];
144
+ auto & right_sym = symbols[bigram.right];
145
+
146
+ // if one of the symbols already got merged, skip it.
147
+ if (left_sym.n == 0 || right_sym.n == 0 ||
148
+ left_sym.n + right_sym.n != bigram.size) {
149
+ continue;
150
+ }
151
+
152
+ // merge the right sym into the left one
153
+ left_sym.n += right_sym.n;
154
+ right_sym.n = 0;
155
+
156
+ //LLAMA_LOG_INFO("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
157
+
158
+ // remove the right sym from the chain
159
+ left_sym.next = right_sym.next;
160
+ if (right_sym.next >= 0) {
161
+ symbols[right_sym.next].prev = bigram.left;
162
+ }
163
+
164
+ // find more substitutions
165
+ try_add_bigram(left_sym.prev, bigram.left);
166
+ try_add_bigram(bigram.left, left_sym.next);
167
+ }
168
+
169
+ for (int i = 0; i != -1; i = symbols[i].next) {
170
+ auto & symbol = symbols[i];
171
+ resegment(symbol, output);
172
+ }
173
+ }
174
+
175
+ private:
176
+ void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
177
+ auto text = std::string(symbol.text, symbol.n);
178
+ auto token = vocab.text_to_token(text);
179
+
180
+ // Do we need to support is_unused?
181
+ if (token != LLAMA_TOKEN_NULL) {
182
+ output.push_back(token);
183
+ return;
184
+ }
185
+
186
+ const auto p = rev_merge.find(text);
187
+
188
+ if (p == rev_merge.end()) {
189
+ // output any symbols that did not form tokens as bytes.
190
+ output.reserve(output.size() + symbol.n);
191
+ for (int j = 0; j < (int)symbol.n; ++j) {
192
+ llama_token id = vocab.byte_to_token(symbol.text[j]);
193
+ output.push_back(id);
194
+ }
195
+ return;
196
+ }
197
+
198
+ resegment(symbols[p->second.first], output);
199
+ resegment(symbols[p->second.second], output);
200
+ }
201
+
202
+ void try_add_bigram(int left, int right) {
203
+ if (left == -1 || right == -1) {
204
+ return;
205
+ }
206
+ const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
207
+ auto token = vocab.text_to_token(text);
208
+
209
+ if (token == LLAMA_TOKEN_NULL) {
210
+ return;
211
+ }
212
+
213
+ if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
214
+ return;
215
+ }
216
+
217
+ const auto & tok_data = vocab.get_token_data(token);
218
+
219
+ llm_bigram_spm bigram;
220
+ bigram.left = left;
221
+ bigram.right = right;
222
+ bigram.score = tok_data.score;
223
+ bigram.size = text.size();
224
+
225
+ work_queue.push(bigram);
226
+
227
+ // Do we need to support is_unused?
228
+ rev_merge[text] = std::make_pair(left, right);
229
+ }
230
+
231
+ const llama_vocab & vocab;
232
+ // currently unused
233
+ // const llm_tokenizer_spm * spm_tokenizer;
234
+
235
+ std::vector<llm_symbol> symbols;
236
+ llm_bigram_spm::queue work_queue;
237
+ std::map<std::string, std::pair<int, int>> rev_merge;
238
+ };
239
+
240
+ //
241
+ // BPE tokenizer
242
+ // adapted from https://github.com/cmp-nct/ggllm.cpp [MIT License]
243
+ // tried to simplify unicode stuff, so most likely does not work 100% correctly!
244
+ //
245
+
246
+ // TODO: there are a lot of common parts between spm and bpe tokenizers, should be refactored and reused
247
+
248
// priority_queue extension that allows moving the top element out instead of
// copying it (std::priority_queue::top() only exposes a const reference).
// The inherited pop() is deleted so callers cannot accidentally discard the
// element they meant to take.
template<typename T, typename Container = std::vector<T>, typename Compare = std::less<typename Container::value_type>>
class llama_priority_queue : public std::priority_queue<T, Container, Compare> {
public:
    using std::priority_queue<T, Container, Compare>::priority_queue;

    // Remove and return the highest-priority element by move.
    T pop_move() {
        // rotate the max element to the back of the underlying container,
        // then move it out and shrink
        std::pop_heap(this->c.begin(), this->c.end(), this->comp);
        T top_item(std::move(this->c.back()));
        this->c.pop_back();
        return top_item;
    }

    void pop() = delete;
};
262
+
263
// Candidate merge of two adjacent symbols for the BPE tokenizer.
// The priority queue yields the bigram with the LOWEST merge rank first
// (earlier-learned merges take precedence); ties are broken in favor of
// the leftmost position.
struct llm_bigram_bpe {
    struct comparator {
        bool operator()(const llm_bigram_bpe & l, const llm_bigram_bpe & r) const {
            return l.rank > r.rank || (l.rank == r.rank && l.left > r.left);
        }
    };

    using queue_storage = std::vector<llm_bigram_bpe>;
    using queue = llama_priority_queue<llm_bigram_bpe, queue_storage, comparator>;
    llm_symbol::index left;  // index of the left symbol
    llm_symbol::index right; // index of the right symbol
    std::string text;        // concatenated text of both symbols at queue time
    int rank;                // BPE merge rank from the vocab (lower = earlier merge)
    size_t size;             // byte length of `text`
};
278
+
279
// BPE tokenizer state: holds only the list of pre-tokenization regexes
// selected by the vocab's pre-tokenizer type. The regexes are applied later
// by unicode_regex_split() inside the BPE session; they must match the
// original model's tokenizer.json behavior, so several entries below are
// hand-adapted from the upstream patterns (see the per-case comments).
struct llm_tokenizer_bpe : llm_tokenizer {
    llm_tokenizer_bpe(const llama_vocab & vocab) {
        LM_GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
                    //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

                    // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                regex_exprs = {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
                    "\\s?[!-/:-~!-/:-~‘-‟ -。]+",
                    "\\s+$",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE:
                regex_exprs = {
                    "\\p{N}{1,3}",
                    "[一-龥぀-ゟ゠-ヿ]+",
                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
                    "\\s?\\p{L}+",
                    "\\s?\\p{P}+",
                    "[一-龥ࠀ-一가-퟿]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_FALCON:
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|`]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "[0-9][0-9][0-9]",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STARCODER:
            case LLAMA_VOCAB_PRE_TYPE_REFACT:
            case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
            case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
            case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
            case LLAMA_VOCAB_PRE_TYPE_EXAONE:
            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                regex_exprs = {
                    "\\p{N}",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT2:
            case LLAMA_VOCAB_PRE_TYPE_MPT:
            case LLAMA_VOCAB_PRE_TYPE_OLMO:
            case LLAMA_VOCAB_PRE_TYPE_JAIS:
            case LLAMA_VOCAB_PRE_TYPE_TRILLION:
                regex_exprs = {
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
            case LLAMA_VOCAB_PRE_TYPE_QWEN2:
            case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_PORO:
            case LLAMA_VOCAB_PRE_TYPE_BLOOM:
            case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHATGLM4:
                regex_exprs = {
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_VIKING:
                regex_exprs = {
                    " ?[^(\\s|.,!?…。,、।۔،)]+",
                    "\\p{N}",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_TEKKEN:
                // original regex from tokenizer.json
                // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
                regex_exprs = {
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_CHAMELEON:
                // Note: in theory, the special token (sentinel and image token) regex_exprs below
                // are unnecessary, as they are split in `tokenizer_st_partition` anyway.
                // However, since the upstream pre-tokenizer uses them, they are also
                // included here (see https://huggingface.co/facebook/chameleon-7b).
                regex_exprs = {
                    "<sentinel:[0-9]+>", // Sentinel tokens
                    "(IMGIMG)((A|B|C|D|E|F|G|H|I){1,4})Z", // Image tokens
                    "([\\t\\n]| | )", // directly from tokenizer.json
                    "\\p{N}", // Individual digits
                    "[\\p{P}!-/:-@\\[-`{-~]", // Punctuation, Isolated
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_GPT4O:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                    "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
                regex_exprs = {
                    // K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
                    // The custom handler implements all K2 patterns with proper Han character exclusion
                    "\\p{Han}+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
                regex_exprs = {
                    "\\p{N}+",
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?)
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                };
                break;
            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
                regex_exprs = {
                    // original regex from tokenizer.json
                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                };
                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
                    "[\\p{P}\\$\\+<=>\\^~\\|]+",
                    "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                    "\\p{N}+",
                    "[0-9][0-9][0-9]",
                };
                break;
        }
    }

    // pre-tokenization patterns, applied in order by unicode_regex_split()
    std::vector<std::string> regex_exprs;
};
451
+
452
// One BPE tokenization pass over a single text fragment. A session holds the
// mutable scratch state (symbol lists, merge queue) while the shared
// llm_tokenizer_bpe only supplies the pre-tokenization regexes.
struct llm_tokenizer_bpe_session {
    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    // Append a single token id to the output.
    static void append(const llama_token token_id, std::vector<llama_token> & output)  {
        output.push_back(token_id);
    }

    // Prepend BOS when the vocab requests it; returns true if a token was added.
    bool append_bos(std::vector<llama_token> & output) const {
        if (vocab.get_add_bos()) {
            LM_GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }

    // Append EOS when the vocab requests it; returns true if a token was added.
    bool append_eos(std::vector<llama_token> & output) const {
        if (vocab.get_add_eos()) {
            LM_GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }

    // Warn when the automatically added BOS/EOS duplicates one already present
    // in the prompt (checked at output[1] / second-to-last respectively).
    void check_double_bos_eos(const std::vector<llama_token> & output) const {
        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
    }

    // Tokenize `text`: split with the pre-tokenizer regexes, apply the
    // lowest-rank BPE merges within each word, then emit the surviving
    // symbols as token ids (falling back to single-byte tokens).
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);

        symbols_final.clear();

        for (const auto & word : word_collection) {
            work_queue = llm_bigram_bpe::queue();
            symbols.clear();

            int index = 0;
            size_t offset = 0;

            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
            // whole word is already a token: take it as a single symbol
            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }

            // split the remainder into utf8 characters (clamped to word end)
            while (offset < word.size()) {
                llm_symbol sym;
                size_t char_len = std::min(word.size() - offset, (size_t) unicode_len_utf8(word[offset]));
                sym.text = word.c_str() + offset;
                sym.n = char_len;
                offset += sym.n;
                sym.prev = index - 1;
                sym.next = offset == word.size() ? -1 : index + 1;
                index++;
                symbols.emplace_back(sym);
            }
            for (int i = 1; i < (int) symbols.size(); ++i) {
                add_new_bigram(i - 1, i);
            }

            // build token(s)
            while (!work_queue.empty()) {
                auto bigram = work_queue.pop_move();

                auto & left_symbol = symbols[bigram.left];
                auto & right_symbol = symbols[bigram.right];

                // either side already consumed by a previous merge
                if (left_symbol.n == 0 || right_symbol.n == 0) {
                    continue;
                }
                std::string left_token = std::string(left_symbol.text, left_symbol.n);
                std::string right_token = std::string(right_symbol.text, right_symbol.n);
                if (left_token + right_token != bigram.text) {
                    continue;  // Skip this bigram if it's outdated
                }

                // merge the right sym into the left one
                left_symbol.n += right_symbol.n;
                right_symbol.n = 0;

                // remove the right sym from the chain
                left_symbol.next = right_symbol.next;
                if (right_symbol.next >= 0) {
                    symbols[right_symbol.next].prev = bigram.left;
                }

                add_new_bigram(left_symbol.prev, bigram.left);  // left side of current symbol
                add_new_bigram(bigram.left, left_symbol.next);  // right side of current symbol
            }

            // add the finished tokens to the final list keeping correct order for next and prev
            for (auto & sym : symbols) {
                if (sym.n > 0) {
                    sym.prev = final_prev_index;
                    sym.next = -1;
                    if (final_prev_index != -1) {
                        symbols_final[final_prev_index].next = symbols_final.size();
                    }
                    symbols_final.emplace_back(sym);
                    final_prev_index = symbols_final.size() - 1;
                }
            }
        }

        symbols = symbols_final;

        if (!symbols.empty()) {
            for (int i = 0; i != -1; i = symbols[i].next) {
                auto & symbol = symbols[i];
                if (symbol.n == 0) {
                    continue;
                }

                const std::string str = std::string(symbol.text, symbol.n);
                const auto token = vocab.text_to_token(str);

                if (token == LLAMA_TOKEN_NULL) {
                    // unknown text: emit whichever raw bytes map to tokens
                    for (auto j = str.begin(); j != str.end(); ++j) {
                        std::string byte_str(1, *j);
                        auto token_multibyte = vocab.text_to_token(byte_str);
                        if (token_multibyte != LLAMA_TOKEN_NULL) {
                            output.push_back(token_multibyte);
                        }
                    }
                } else {
                    output.push_back(token);
                }
            }
        }
    }

private:
    // Queue the pair (left, right) as a merge candidate when the vocab knows
    // a BPE rank for the concatenated text.
    void add_new_bigram(int left, int right) {
        if (left == -1 || right == -1) {
            return;
        }
        std::string left_token = std::string(symbols[left].text, symbols[left].n);
        std::string right_token = std::string(symbols[right].text, symbols[right].n);

        int rank_found = -1;

        rank_found = vocab.find_bpe_rank(left_token, right_token);

        if (rank_found < 0) {
            return;
        }

        llm_bigram_bpe bigram;

        bigram.left  = left;
        bigram.right = right;
        bigram.text  = left_token + right_token;
        bigram.size  = left_token.size() + right_token.size();
        bigram.rank  = rank_found;

        work_queue.push(bigram);
    }

    const llama_vocab & vocab;
    const llm_tokenizer_bpe & tokenizer;

    std::vector<llm_symbol> symbols;       // per-word scratch symbols
    std::vector<llm_symbol> symbols_final; // merged symbols accumulated across all words
    llm_bigram_bpe::queue work_queue;
};
631
+
632
+ //
633
+ // WPM tokenizer
634
+ //
635
+
636
// WPM keeps no precomputed per-vocab state; this type exists so the WPM
// tokenizer can be stored behind the common llm_tokenizer interface.
struct llm_tokenizer_wpm : llm_tokenizer {
    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
};
639
+
640
// One WPM (WordPiece, BERT-style) tokenization pass: text is NFD-normalized,
// lower-cased, split on whitespace/punctuation/CJK characters, and each word
// is then matched greedily against the vocab (longest token first).
struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}

    // Tokenize `text` and append the resulting token ids to `output`.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already

        // find the longest tokens that form the words
        for (const std::string & word : words) {
            // skip empty words
            if (word.size() == 0) {
                continue;
            }

            // prepend phantom space (U+2581 as UTF-8) to mark a word start
            const std::string word1 = "\xe2\x96\x81" + word;
            const int n = word1.size();

            // remember where this word's tokens start so they can be
            // discarded wholesale if the word cannot be fully matched
            const size_t current_tokens = output.size();

            // we're at the start of a new word
            // move through character position in word
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
                    auto id = vocab.text_to_token(word1.substr(i, j - i));
                    if (id != LLAMA_TOKEN_NULL) {
                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
                    }
                }

                if (!match) { // discard all
                    output.resize(current_tokens);
                    break;  // and discard next tokens
                }
            }

            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
                output.push_back(vocab.token_unk());
            }
        }
    }

    // Normalize `text` (NFD + lowercase) and split it into "words":
    // whitespace separates words; punctuation, ASCII symbols and CJK
    // characters each become single-character words.
    // TODO: reduce string copies by using cpts_offs array
    static std::vector<std::string> preprocess(const std::string & text) {
        const std::vector<uint32_t> cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text));
        std::vector<std::string> words(1, "");

        for (const uint32_t cpt : cpts_nfd) {
            const auto flags = unicode_cpt_flags_from_cpt(cpt);

            if (flags.is_whitespace) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                continue;
            }

            assert (!flags.is_separator);
            // drop NUL, the replacement character and control characters
            if (cpt == 0 || cpt == 0xFFFD || flags.is_control) {
                continue;
            }

            const std::string s = unicode_cpt_to_utf8(unicode_tolower(cpt));
            if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) {
                if (words.back().size()) { // finish previous word if any
                    words.emplace_back();
                }
                words.back() = s; // single char word
                words.emplace_back(); // start a new word
            } else {
                words.back() += s; // append char to word
            }
        }

        // drop a trailing empty word left by the loop above
        if (!words.back().size()) {
            words.pop_back();
        }

        return words;
    }

    // True for code points in the CJK ideograph blocks; these are split into
    // single-character words to mirror the HF BERT tokenizer behavior.
    static bool is_chinese_char(uint32_t cpt) {
        return
            (cpt >= 0x04E00 && cpt <= 0x09FFF) ||
            (cpt >= 0x03400 && cpt <= 0x04DBF) ||
            (cpt >= 0x20000 && cpt <= 0x2A6DF) ||
            (cpt >= 0x2A700 && cpt <= 0x2B73F) ||
            (cpt >= 0x2B740 && cpt <= 0x2B81F) ||
            (cpt >= 0x2B920 && cpt <= 0x2CEAF) || // this should be 0x2B820 but in hf rust code it is 0x2B920
            (cpt >= 0x0F900 && cpt <= 0x0FAFF) ||
            (cpt >= 0x2F800 && cpt <= 0x2FA1F);
            //(cpt >= 0x3000 && cpt <= 0x303F) ||
            //(cpt >= 0xFF00 && cpt <= 0xFFEF);
    }

private:
    const llama_vocab & vocab;
    // currently unused
    // const llm_tokenizer_wpm * wpm_tokenizer;
};
747
+
748
+ //
749
+ // UGM tokenizer
750
+ //
751
+
752
// Shared (per-model) state for the UGM / SentencePiece-unigram tokenizer:
// parses the precompiled charsmap (normalizer data) and builds the token
// tries from the vocabulary.
// NOTE(review): xcda_array and prefix_replacements point INTO the
// caller-owned precompiled_charsmap buffer — assumes that buffer outlives
// this tokenizer object; confirm against the owner (llama_vocab::impl).
struct llm_tokenizer_ugm : llm_tokenizer {
    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;

            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }

            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;

            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
            prefix_replacements = &precompiled_charsmap[charsmap_offset];
            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }

        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
            const auto & token_data = vocab.get_token_data(id);

            // track the score range over normal tokens only; min_score feeds
            // the unknown-token penalty below
            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }

            // normal, user-defined and unused tokens all participate in matching
            if (vocab.is_normal(id) ||
                vocab.is_user_defined(id) ||
                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }

            // user-defined tokens additionally bypass normalization (see
            // llm_tokenizer_ugm_session::normalize_prefix)
            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }

        // unknown tokens score strictly worse than the worst normal token
        unknown_token_score = min_score - unknown_token_score_penalty;
    }

    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

    // null-terminated replacement strings referenced by XCDA leaf values
    const char * prefix_replacements = NULL;
    size_t prefix_replacements_size = 0;

    // bit-packed XCDA trie used for normalization prefix matching
    const uint32_t * xcda_array = NULL;
    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;

    // score range over normal tokens (log probabilities)
    float min_score = FLT_MAX;
    float max_score = -FLT_MAX;

    float unknown_token_score_penalty = 10.0;
    float unknown_token_score;

    struct naive_trie token_matcher;
};
818
+
819
+ struct llm_tokenizer_ugm_session {
820
+ llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
821
+
822
+ /* This implementation is based on SentencePiece optimized Viterbi algorithm for
823
+ * unigram language models. The general idea is to:
824
+ * - move along the input sequence in steps of one UTF code point,
825
+ * - at each step find all possible tokenizations of the prefix by
826
+ * traversing the tokens trie,
827
+ * - for each tokenization store the best one so far (by higher score)
828
+ * - use the position in sequence after given token as an index to store
829
+ * results
830
+ * - if there was no valid tokenization of the current UTF code point
831
+ * then use unknown token with additional score penalty
832
+ * After processing the whole sequence we backtrack from the end to get
833
+ * the best tokenization.
834
+ */
835
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
836
+ // get current size of output (for reversal later)
837
+ size_t output_size = output.size();
838
+
839
+ // normalize the input first
840
+ std::string normalized;
841
+ normalize(text, &normalized);
842
+ size_t input_len = normalized.size();
843
+ if (input_len == 0) {
844
+ return;
845
+ }
846
+
847
+ // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
848
+ std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
849
+ // at the beginning tokenization score is zero
850
+ tokenization_results[0] = { vocab.token_unk(), 0, 0 };
851
+
852
+ for (size_t input_offset = 0; input_offset < input_len;) {
853
+ size_t prefix_offset = input_offset;
854
+ // calculate how many code units are in the currently processed UTF code point
855
+ size_t n_utf8_code_units = std::min<size_t>(unicode_len_utf8(normalized[input_offset]), input_len - input_offset);
856
+
857
+ // traverse the token matcher trie to find a matching token
858
+ bool single_codepoint_token_found = false;
859
+ const struct best_tokenization & current_best = tokenization_results[input_offset];
860
+ const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
861
+
862
+ while (prefix_offset <= input_len && node != NULL) {
863
+ // check if we found valid token in prefix
864
+ if (node->has_value) {
865
+ // check if it corresponds to the whole UTF code point
866
+ if (prefix_offset - input_offset == n_utf8_code_units) {
867
+ single_codepoint_token_found = true;
868
+ }
869
+ llama_token token_id = node->value;
870
+ const auto & token_data = vocab.get_token_data(token_id);
871
+
872
+ // we set the user-defined token scores to 0 to make them more likely to be selected
873
+ // (normal token scores are log probabilities, so they are negative)
874
+ // score type is double here to make tokenization results exactly
875
+ // the same as in the HF tokenizer using SentencePiece
876
+ const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
877
+ const double challenger_score = current_best.score_sum + token_score;
878
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
879
+ if (challenger_score > current_champ.score_sum) {
880
+ struct best_tokenization challenger = { token_id, input_offset, challenger_score };
881
+ current_champ = challenger;
882
+ }
883
+ }
884
+ node = node->traverse(normalized[prefix_offset++]);
885
+ }
886
+
887
+ // if we didn't find a valid token corresponding to the whole UTF code point
888
+ // then use unknown token as the tokenization of this UTF code point
889
+ if (!single_codepoint_token_found) {
890
+ const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
891
+ prefix_offset = input_offset + n_utf8_code_units;
892
+ struct best_tokenization & current_champ = tokenization_results[prefix_offset];
893
+ if (challenger_score > current_champ.score_sum) {
894
+ struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
895
+ current_champ = challenger;
896
+ }
897
+ }
898
+
899
+ // move to the next UTF code point
900
+ input_offset += n_utf8_code_units;
901
+ }
902
+
903
+ // now backtrack from the end to gather token ids of the best tokenization
904
+ // merge sequences of consecutive unknown tokens into single unknown tokens
905
+ bool is_prev_unknown = false;
906
+ for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
907
+ bool is_unknown = tokenization.token_id == vocab.token_unk();
908
+ if (!(is_prev_unknown && is_unknown)) {
909
+ output.push_back(tokenization.token_id);
910
+ }
911
+ if (tokenization.input_offset == 0) {
912
+ break;
913
+ }
914
+ is_prev_unknown = is_unknown;
915
+ }
916
+
917
+ // reverse the output since we added tokens starting from the end of the input
918
+ std::reverse(output.begin() + output_size, output.end());
919
+ }
920
+
921
+ private:
922
+
923
+ // helper structure for returning normalization results
924
+ struct normalization_result {
925
+ const char * normalized;
926
+ size_t normalized_len;
927
+ size_t consumed_input;
928
+ };
929
+
930
+ void normalize(const std::string& input, std::string * normalized) {
931
+ normalized->clear();
932
+ normalized->reserve(input.size() * 3);
933
+
934
+ const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
935
+
936
+ const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
937
+ const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
938
+ const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
939
+
940
+ bool is_space_prepended = false;
941
+ bool processing_non_ws = false;
942
+
943
+ size_t input_len = input.size();
944
+
945
+ for (size_t input_offset = 0; input_offset < input_len; ) {
946
+ auto norm_res = normalize_prefix(input, input_offset);
947
+ for (size_t i = 0; i < norm_res.normalized_len; i++) {
948
+ char c = norm_res.normalized[i];
949
+ if (c != ' ') {
950
+ if (!processing_non_ws) {
951
+ processing_non_ws = true;
952
+ if ((shall_prepend_space && !is_space_prepended) || shall_merge_spaces) {
953
+ normalized->append(space);
954
+ is_space_prepended = true;
955
+ }
956
+ }
957
+ normalized->push_back(c);
958
+ } else {
959
+ if (processing_non_ws) {
960
+ processing_non_ws = false;
961
+ }
962
+ if (!shall_merge_spaces) {
963
+ normalized->append(space);
964
+ }
965
+ }
966
+ }
967
+
968
+ input_offset += norm_res.consumed_input;
969
+ }
970
+
971
+ if (shall_append_space) {
972
+ normalized->append(space);
973
+ }
974
+ }
975
+
976
+ /*
977
+ * This structure is a view wrapper for XOR-compressed double array (XCDA)
978
+ * See Shunsuke Kanda (2018). Space- and Time-Efficient String Dictionaries.
979
+ * Each bit-packed entry contains:
980
+ * - BASE array value in bits 10-30
981
+ * - LCHECK array value in bits 0-7
982
+ * - LEAF array value in bit 9
983
+ * Entries containing indexes of replacement sequences have set bit 31
984
+ */
985
+ struct xcda_array_view {
986
+ public:
987
+ xcda_array_view(const uint32_t * xcda_array, size_t xcda_array_size) : xcda_array(xcda_array), xcda_array_size(xcda_array_size) {
988
+ }
989
+ uint32_t get_base(size_t index) {
990
+ uint32_t packed_node = get_node(index);
991
+ return (packed_node >> 10) << ((packed_node & (1U << 9)) >> 6);
992
+ }
993
+ uint32_t get_lcheck(size_t index) {
994
+ uint32_t packed_node = get_node(index);
995
+ return packed_node & ((1U << 31) | 0xff);
996
+ }
997
+ bool get_leaf(size_t index) {
998
+ uint32_t packed_node = get_node(index);
999
+ return (packed_node >> 8) & 1;
1000
+ }
1001
+ uint32_t get_value(size_t index) {
1002
+ uint32_t packed_node = get_node(index);
1003
+ return packed_node & ((1U << 31) - 1);
1004
+ }
1005
+ private:
1006
+ uint32_t get_node(size_t index) {
1007
+ if (index > xcda_array_size) {
1008
+ throw std::runtime_error("Index out of array bounds in XCDA array!");
1009
+ }
1010
+ return xcda_array[index];
1011
+ }
1012
+ const uint32_t * xcda_array;
1013
+ size_t xcda_array_size;
1014
+ };
1015
+
1016
+ // this structure stores the best tokenization so far at input_offset
1017
+ struct best_tokenization {
1018
+ llama_token token_id;
1019
+ size_t input_offset;
1020
+ double score_sum;
1021
+ };
1022
+
1023
+ struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
1024
+ if (input_offset == input.size()) {
1025
+ return { &input[input_offset], 0, 0 };
1026
+ }
1027
+
1028
+ // if input prefix matches some user-defined token return this token as normalization result
1029
+ auto user_defined_token_match =
1030
+ tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
1031
+ if (user_defined_token_match.second > 0) {
1032
+ return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
1033
+ }
1034
+
1035
+ size_t longest_prefix_length = 0;
1036
+ size_t longest_prefix_offset = 0;
1037
+
1038
+ if (tokenizer.xcda_array_size > 0) {
1039
+ struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
1040
+
1041
+ // Find the longest normalized sequence matching the input prefix by walking
1042
+ // the XOR-compressed compact double array (XCDA) starting from the root node
1043
+ // We find the index of the next node by calculating BASE[s] ^ c where s is
1044
+ // the index of the previous node and c is a numerical character value
1045
+ uint32_t node_index = 0;
1046
+ // get BASE of the root node
1047
+ node_index = xcda_view.get_base(node_index);
1048
+ for (size_t prefix_offset = input_offset; prefix_offset < input.size(); prefix_offset++) {
1049
+ unsigned char c = input[prefix_offset];
1050
+ if (c == 0) {
1051
+ break;
1052
+ }
1053
+ node_index ^= c;
1054
+ // if value of LCHECK is not c it means that this is not a child of
1055
+ // the previous node, so we stop matching
1056
+ if (xcda_view.get_lcheck(node_index) != c) {
1057
+ break;
1058
+ }
1059
+ bool is_leaf = xcda_view.get_leaf(node_index);
1060
+ // get BASE of the current node
1061
+ node_index ^= xcda_view.get_base(node_index);
1062
+ // if LEAF of the current node is true, it means that its BASE points to the node
1063
+ // containing index of replacement sequence for currently matched input prefix
1064
+ if (is_leaf)
1065
+ {
1066
+ longest_prefix_length = prefix_offset - input_offset + 1;
1067
+ // get index of replacement sequence for currently matched input prefix
1068
+ longest_prefix_offset = xcda_view.get_value(node_index);
1069
+ }
1070
+ }
1071
+ }
1072
+
1073
+ if (longest_prefix_length > 0) {
1074
+ // we have a match, so return the replacement sequence
1075
+ if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
1076
+ throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
1077
+ }
1078
+ const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
1079
+ return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
1080
+ }
1081
+
1082
+ // check if the input prefix contains a valid sequence of UTF-8 code units
1083
+ try {
1084
+ // if yes, return this sequence unmodified
1085
+ size_t prefix_offset = input_offset;
1086
+ unicode_cpt_from_utf8(input, prefix_offset);
1087
+ return { &input[input_offset], prefix_offset - input_offset, prefix_offset - input_offset };
1088
+ } catch (std::invalid_argument & /*ex*/) {
1089
+ // if no, consume 1 byte and return U+FFFD - REPLACEMENT CHARACTER
1090
+ return { "\xEF\xBF\xBD", 3, 1 };
1091
+ }
1092
+ }
1093
+
1094
+ const llama_vocab & vocab;
1095
+ const llm_tokenizer_ugm & tokenizer;
1096
+ };
1097
+
1098
+ //
1099
+ // RWKV tokenizer
1100
+ //
1101
+
1102
// Decode an RWKV-escaped token string into its raw bytes.
// Supported escapes: \t \n \r \xHH (two hex digits); any other escaped
// character is emitted literally.
// Fix vs. original: the hex decoder only handled lowercase 'a'-'f'
// (an uppercase digit like 'A' fell through to `c - '0'` and produced
// a garbage nibble); both cases are now decoded correctly.
static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escaped) {
    std::vector<uint8_t> output;
    output.reserve(escaped.size());

    // Parser state
    bool escaping = false;       // previous character was an unconsumed '\'
    uint8_t hex_remaining = 0;   // hex digits still expected after "\x"
    uint8_t hex_acc = 0;         // accumulator for the hex byte value

    // Step through characters, performing parsing
    for (const char & c : escaped) {
        // If we're parsing a hex code, interpret the next character
        if (hex_remaining != 0) {
            uint8_t value;
            if (c >= 'a' && c <= 'f') {
                value = c - 'a' + 10;
            } else if (c >= 'A' && c <= 'F') {
                value = c - 'A' + 10; // accept uppercase hex digits too
            } else {
                value = c - '0';
            }
            hex_acc = (hex_acc << 4) + value;

            hex_remaining -= 1;
            if (hex_remaining == 0) {
                output.push_back(hex_acc);
                hex_acc = 0;
            }

            continue;
        }

        // If we got an escape character, interpret it
        if (escaping) {
            if (c == 't') {
                output.push_back('\t');
            } else if (c == 'n') {
                output.push_back('\n');
            } else if (c == 'r') {
                output.push_back('\r');
            } else if (c == 'x') {
                hex_remaining = 2;
            } else {
                output.push_back(c); // unknown escape: emit literally
            }

            escaping = false;
            continue;
        }

        if (c == '\\') {
            escaping = true;
            continue;
        }

        output.push_back(c);
    }

    return output;
}
1155
+
1156
+ struct llm_tokenizer_rwkv : llm_tokenizer {
1157
+ llm_tokenizer_rwkv(const llama_vocab & vocab) {
1158
+ // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
1159
+ // For now, we decode the vocab here into the lookup we'll use for tokenization.
1160
+
1161
+ // build trie
1162
+ for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
1163
+ const auto & data = vocab.get_token_data(id);
1164
+ const auto text = llama_unescape_rwkv_token(data.text);
1165
+ token_matcher.insert((const char *) text.data(), text.size(), id);
1166
+ }
1167
+ }
1168
+
1169
+ struct naive_trie token_matcher;
1170
+ };
1171
+
1172
// Per-call state for RWKV tokenization: greedy longest-match over the
// byte-level trie built by llm_tokenizer_rwkv.
struct llm_tokenizer_rwkv_session {
    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}

    // Appends token ids for `text` to `output`; bytes with no trie entry
    // become the unknown token.
    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        uint32_t position = 0;
        while (position < text.size()) {
            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
                output.push_back(vocab.token_unk());
                position += 1;
                continue;
            }

            // traverse the trie to find the longest matching token
            uint32_t token_id = 0;
            uint32_t token_length = 0; // end position (exclusive) of the last full match
            while (node != NULL) {
                if (node->has_value) {
                    token_id = node->value;
                    token_length = position + 1;
                }
                // NOTE(review): when position reaches text.size() this reads
                // text[size()], which std::string (C++11) guarantees is '\0';
                // assumes a '\0' byte terminates the trie walk — confirm
                // against naive_trie/vocab contents.
                node = node->traverse(text[++position]);
            }

            // add the longest matching token and rewind to just past it
            // (the walk may have advanced beyond the last full match)
            output.push_back(token_id);
            position = token_length;
        }
    }

  private:
    const llama_vocab & vocab;
    const llm_tokenizer_rwkv & tokenizer;
};
1207
+
1208
// PLaMo-2 tokenizer: builds a flattened suffix/prefix table from the vocab
// and runs a dynamic-programming (Viterbi-like) search in encode(), with
// UTF-8 byte tokens as the fallback for uncovered codepoints.
struct llm_tokenizer_plamo2 : llm_tokenizer {
    llm_tokenizer_plamo2(const llama_vocab & vocab) {
        build(vocab);
    }

    // Construct tokens_, bytes_, to_suffix_id_ and table_ from the vocab.
    void build(const llama_vocab & vocab) {
        // Reset internal structures
        tokens_.clear();
        bytes_.assign(256, 0);
        to_suffix_id_.clear();
        table_.clear();

        // Build token list and byte mapping
        std::unordered_map<std::string, float> suffix_to_score;
        std::unordered_map<std::string, llama_token> token_to_id;

        for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
            const auto & entry = vocab.get_token_data(token_id);
            tokens_.push_back(entry.text);
            token_to_id[entry.text] = static_cast<llama_token>(token_id);

            // Handle byte tokens of the form "<0xHH>"
            if (vocab.is_byte(token_id)) {
                if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
                    std::string hex_str = entry.text.substr(3, 2);
                    int byte_val = std::stoi(hex_str, nullptr, 16);
                    bytes_[byte_val] = static_cast<llama_token>(token_id);
                }
                continue;
            }

            // Add token and all its suffixes to suffix_to_score
            suffix_to_score[entry.text] = entry.score;

            // Extract suffixes character by character (UTF-8 aware);
            // NaN marks a suffix that is not itself a vocab token
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
            for (size_t i = 1; i < cpts.size(); ++i) {
                std::string suffix;
                for (size_t j = i; j < cpts.size(); ++j) {
                    suffix += unicode_cpt_to_utf8(cpts[j]);
                }
                if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
                    suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
                }
            }
        }

        // Check that all byte tokens are set
        // NOTE(review): treats token id 0 as "unset" — assumes no byte token
        // legitimately has id 0; confirm against the PLaMo-2 vocab layout.
        for (int i = 0; i < 256; ++i) {
            if (bytes_[i] == 0) {
                throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
            }
        }

        // Build suffix list in lexicographical order of reversed strings
        std::vector<std::string> suffixes;
        for (const auto & pair : suffix_to_score) {
            suffixes.push_back(pair.first);
        }
        suffixes.push_back(""); // Empty suffix

        std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
            std::string rev_a(a.rbegin(), a.rend());
            std::string rev_b(b.rbegin(), b.rend());
            return rev_a < rev_b;
        });

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                // piece_code packs (first codepoint, id of remaining suffix)
                // into one 64-bit key; `remaining` was visited earlier because
                // suffixes are sorted by reversed string
                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1; // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++; // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                // scores are stored as fixed-point int32 (x1e4); NaN (suffix
                // that is not a token) becomes INVALID_SCORE
                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    // Encode text into token ids; pure function of text and the built tables.
    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming); 1<<60 acts as +infinity
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score; // scores are negated: minimize

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count (byte-fallback emits one token per byte)
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                // sentinel row terminates the candidate list for this suffix
                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens: re-encode the codepoint as UTF-8
                // and emit one byte token per code unit
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            b = (0xF00 >> s) & 0xFF; // UTF-8 leading-byte prefix
                        } else {
                            b = 0x80; // continuation-byte prefix
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
  private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants
    static constexpr int32_t INVALID_SCORE = -20000000;
    static constexpr int32_t UNKNOWN_SCORE = -10000000;

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};
1473
+
1474
+ struct llm_tokenizer_plamo2_session {
1475
+ llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}
1476
+
1477
+ void tokenize(const std::string & text, std::vector<llama_token> & output) {
1478
+ std::vector<llama_token> tokens = tokenizer.encode(text);
1479
+ output.insert(output.end(), tokens.begin(), tokens.end());
1480
+ }
1481
+
1482
+ private:
1483
+ const llm_tokenizer_plamo2 & tokenizer;
1484
+ };
1485
+
1486
+ //
1487
+ // impl
1488
+ //
1489
+
1490
// Variant tag for fragment_buffer_variant: a fragment is either an already
// resolved (special) token id or a span of raw text still to be tokenized.
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
} FRAGMENT_BUFFER_VARIANT_TYPE;
1494
+
1495
// Tagged-union-style fragment used during special-token partitioning:
// either a resolved token id, or an (offset, length) view into raw text.
// raw_text is a reference member; the token constructor binds it to the
// empty _dummy member so the reference is always valid.
struct fragment_buffer_variant {
    // token fragment
    fragment_buffer_variant(llama_token _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
        raw_text(_dummy),
        offset(0),
        length(0) {}

    // raw-text fragment: a non-empty [offset, offset+length) span of _raw_text
    // NOTE(review): stores a reference to _raw_text — assumes the referenced
    // string outlives this fragment; confirm against the caller.
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
        token((llama_token) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
            LM_GGML_ASSERT(_offset >= 0);
            LM_GGML_ASSERT(_length >= 1);
            LM_GGML_ASSERT(offset + length <= raw_text.length());
        }

    const FRAGMENT_BUFFER_VARIANT_TYPE type;
    const llama_token token;
    const std::string _dummy;          // empty target for raw_text in the token case
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
};
1523
+
1524
+ struct llama_vocab::impl {
1525
+ uint32_t n_token_types = 0; // for BERT-style token types
1526
+
1527
+ std::string tokenizer_model;
1528
+ std::string tokenizer_pre;
1529
+
1530
+ enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
1531
+ enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1532
+
1533
+ int max_token_len = 0; // used for optimizing longest token search
1534
+
1535
+ // default LLaMA special tokens
1536
+ // TODO: should we set all of these to LLAMA_TOKEN_NULL?
1537
+ llama_token special_bos_id = 1;
1538
+ llama_token special_eos_id = 2;
1539
+ llama_token special_eot_id = LLAMA_TOKEN_NULL;
1540
+ llama_token special_eom_id = LLAMA_TOKEN_NULL;
1541
+ llama_token special_unk_id = 0;
1542
+ llama_token special_sep_id = LLAMA_TOKEN_NULL;
1543
+ llama_token special_pad_id = LLAMA_TOKEN_NULL;
1544
+ llama_token special_mask_id = LLAMA_TOKEN_NULL;
1545
+
1546
+ llama_token linefeed_id = 13;
1547
+
1548
+ // fim tokens
1549
+ llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
1550
+ llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
1551
+ llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
1552
+ llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
1553
+ llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
1554
+ llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
1555
+
1556
+ // tokenizer flags
1557
+ bool add_space_prefix = false;
1558
+ bool add_bos = false;
1559
+ bool add_eos = false;
1560
+ bool add_sep = false;
1561
+ bool ignore_merges = false;
1562
+ bool clean_spaces = false; // clean_up_tokenization_spaces
1563
+ bool remove_extra_whitespaces = false;
1564
+ bool escape_whitespaces = true;
1565
+ bool treat_whitespace_as_suffix = false;
1566
+
1567
+ std::unordered_map<std::string, llama_token> token_to_id;
1568
+ std::vector<token_data> id_to_token;
1569
+
1570
+ std::vector<llama_token> cache_special_tokens;
1571
+ std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
1572
+ struct pair_hash {
1573
+ size_t operator()(const std::pair<std::string, std::string> & p) const {
1574
+ return std::hash<std::string>{}(p.first) ^ //create some hash for pair
1575
+ (std::hash<std::string>{}(p.second) << 1);
1576
+ }
1577
+ };
1578
+ std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;
1579
+
1580
+ // set of all tokens that cause "end of generation"
1581
+ std::set<llama_token> special_eog_ids;
1582
+
1583
+ std::unique_ptr<llm_tokenizer> tokenizer;
1584
+
1585
+ std::vector<char> precompiled_charsmap;
1586
+
1587
+ // NOTE(review): binds a non-owning reference to the parent vocab; the
+ // referenced llama_vocab must outlive this impl instance.
+ impl(const llama_vocab & vocab) : vocab(vocab) {
1588
+ }
1589
+
1590
+ ~impl() = default;
1591
+
1592
+ void load(llama_model_loader & ml, const LLM_KV & kv);
1593
+
1594
+ enum llama_vocab_type get_type() const;
1595
+
1596
+ std::string type_name() const;
1597
+
1598
+ bool is_normal (llama_token id) const;
1599
+ bool is_unknown (llama_token id) const;
1600
+ bool is_control (llama_token id) const;
1601
+ bool is_byte (llama_token id) const;
1602
+ bool is_user_defined(llama_token id) const;
1603
+ bool is_unused (llama_token id) const;
1604
+ bool is_eog (llama_token id) const;
1605
+
1606
+ uint8_t token_to_byte(llama_token id) const;
1607
+
1608
+ llama_token_attr token_get_attr(llama_token id) const;
1609
+
1610
+ void init_tokenizer(enum llama_vocab_type type);
1611
+
1612
+ void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
1613
+
1614
+ std::string token_to_piece_for_cache(
1615
+ llama_token token,
1616
+ bool special) const;
1617
+
1618
+
1619
+ std::vector<llama_token> tokenize(
1620
+ const std::string & raw_text,
1621
+ bool add_special,
1622
+ bool parse_special = false) const;
1623
+
1624
+ int32_t tokenize(
1625
+ const char * text,
1626
+ int32_t text_len,
1627
+ llama_token * tokens,
1628
+ int32_t n_tokens_max,
1629
+ bool add_special,
1630
+ bool parse_special) const;
1631
+
1632
+ // does not write null-terminator to buf
1633
+ int32_t token_to_piece(
1634
+ llama_token token,
1635
+ char * buf,
1636
+ int32_t length,
1637
+ int32_t lstrip,
1638
+ bool special) const;
1639
+
1640
+ // use cached data
1641
+ const std::string & token_to_piece(llama_token token) const;
1642
+
1643
+ int32_t detokenize(
1644
+ const llama_token * tokens,
1645
+ int32_t n_tokens,
1646
+ char * text,
1647
+ int32_t text_len_max,
1648
+ bool remove_special,
1649
+ bool unparse_special) const;
1650
+
1651
+ std::string detokenize(
1652
+ const std::vector<llama_token> & tokens,
1653
+ bool special) const;
1654
+
1655
+ void print_info() const;
1656
+
1657
+ private:
1658
+ const llama_vocab & vocab;
1659
+ };
1660
+
1661
+ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
1662
+ struct lm_gguf_context * ctx = ml.meta.get();
1663
+
1664
+ // determine vocab type
1665
+ {
1666
+ ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
1667
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
1668
+
1669
+ ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
1670
+
1671
+ if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
1672
+ type = LLAMA_VOCAB_TYPE_NONE;
1673
+
1674
+ // default special tokens
1675
+ special_bos_id = LLAMA_TOKEN_NULL;
1676
+ special_eos_id = LLAMA_TOKEN_NULL;
1677
+ special_unk_id = LLAMA_TOKEN_NULL;
1678
+ special_sep_id = LLAMA_TOKEN_NULL;
1679
+ special_pad_id = LLAMA_TOKEN_NULL;
1680
+ special_mask_id = LLAMA_TOKEN_NULL;
1681
+ linefeed_id = LLAMA_TOKEN_NULL;
1682
+
1683
+ // read vocab size from metadata
1684
+ uint32_t n_tokens = 0;
1685
+ if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
1686
+ LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
1687
+ id_to_token.resize(n_tokens);
1688
+ }
1689
+
1690
+ return;
1691
+ }
1692
+
1693
+ if (tokenizer_model == "llama") {
1694
+ type = LLAMA_VOCAB_TYPE_SPM;
1695
+
1696
+ // default special tokens
1697
+ special_bos_id = 1;
1698
+ special_eos_id = 2;
1699
+ special_unk_id = 0;
1700
+ special_sep_id = LLAMA_TOKEN_NULL;
1701
+ special_pad_id = LLAMA_TOKEN_NULL;
1702
+ special_mask_id = LLAMA_TOKEN_NULL;
1703
+ } else if (tokenizer_model == "bert") {
1704
+ type = LLAMA_VOCAB_TYPE_WPM;
1705
+
1706
+ // default special tokens
1707
+ special_bos_id = 101;
1708
+ special_eos_id = LLAMA_TOKEN_NULL;
1709
+ special_unk_id = 100;
1710
+ special_sep_id = 102;
1711
+ special_pad_id = 0;
1712
+ special_mask_id = 103;
1713
+
1714
+ add_sep = true;
1715
+ } else if (tokenizer_model == "gpt2") {
1716
+ type = LLAMA_VOCAB_TYPE_BPE;
1717
+
1718
+ // read bpe merges and populate bpe ranks
1719
+ const int merges_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
1720
+ if (merges_keyidx == -1) {
1721
+ throw std::runtime_error("cannot find tokenizer merges in model file\n");
1722
+ }
1723
+
1724
+ const int n_merges = lm_gguf_get_arr_n(ctx, merges_keyidx);
1725
+ for (int i = 0; i < n_merges; i++) {
1726
+ const std::string word = lm_gguf_get_arr_str(ctx, merges_keyidx, i);
1727
+ //LM_GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
1728
+
1729
+ std::string first;
1730
+ std::string second;
1731
+
1732
+ const size_t pos = word.find(' ', 1);
1733
+
1734
+ if (pos != std::string::npos) {
1735
+ first = word.substr(0, pos);
1736
+ second = word.substr(pos + 1);
1737
+ }
1738
+
1739
+ bpe_ranks.emplace(std::make_pair(first, second), i);
1740
+ }
1741
+
1742
+ // default special tokens
1743
+ special_bos_id = 11;
1744
+ special_eos_id = 11;
1745
+ special_unk_id = LLAMA_TOKEN_NULL;
1746
+ special_sep_id = LLAMA_TOKEN_NULL;
1747
+ special_pad_id = LLAMA_TOKEN_NULL;
1748
+ special_mask_id = LLAMA_TOKEN_NULL;
1749
+ } else if (tokenizer_model == "t5") {
1750
+ type = LLAMA_VOCAB_TYPE_UGM;
1751
+
1752
+ // default special tokens
1753
+ special_bos_id = LLAMA_TOKEN_NULL;
1754
+ special_eos_id = 1;
1755
+ special_unk_id = 2;
1756
+ special_sep_id = LLAMA_TOKEN_NULL;
1757
+ special_pad_id = 0;
1758
+ special_mask_id = LLAMA_TOKEN_NULL;
1759
+
1760
+ const int precompiled_charsmap_keyidx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
1761
+ if (precompiled_charsmap_keyidx != -1) {
1762
+ const lm_gguf_type pc_type = lm_gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
1763
+ LM_GGML_ASSERT(pc_type == LM_GGUF_TYPE_INT8 || pc_type == LM_GGUF_TYPE_UINT8);
1764
+
1765
+ const size_t n_precompiled_charsmap = lm_gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
1766
+ const char * pc = (const char *) lm_gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
1767
+ precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
1768
+ #ifdef IS_BIG_ENDIAN
1769
+ // correct endiannes of data in precompiled_charsmap binary blob
1770
+ uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
1771
+ *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
1772
+ assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
1773
+ size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
1774
+ uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
1775
+ for (size_t i = 0; i < xcda_array_size; ++i) {
1776
+ xcda_array[i] = __builtin_bswap32(xcda_array[i]);
1777
+ }
1778
+ #endif
1779
+ }
1780
+ } else if (tokenizer_model == "rwkv") {
1781
+ type = LLAMA_VOCAB_TYPE_RWKV;
1782
+
1783
+ // default special tokens
1784
+ special_bos_id = LLAMA_TOKEN_NULL;
1785
+ special_eos_id = LLAMA_TOKEN_NULL;
1786
+ special_unk_id = LLAMA_TOKEN_NULL;
1787
+ special_sep_id = LLAMA_TOKEN_NULL;
1788
+ special_pad_id = LLAMA_TOKEN_NULL;
1789
+ } else if (tokenizer_model == "plamo2") {
1790
+ type = LLAMA_VOCAB_TYPE_PLAMO2;
1791
+
1792
+ // PLaMo-2 default special tokens (these will be overridden by model config)
1793
+ special_bos_id = 1; // <|plamo:bos|>
1794
+ special_eos_id = 2; // <|plamo:eos|>
1795
+ special_unk_id = 0; // <|plamo:unk|>
1796
+ special_sep_id = LLAMA_TOKEN_NULL;
1797
+ special_pad_id = 3; // <|plamo:pad|>
1798
+ special_mask_id = LLAMA_TOKEN_NULL;
1799
+ } else {
1800
+ throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
1801
+ }
1802
+
1803
+ // for now, only BPE models have pre-tokenizers
1804
+ if (type == LLAMA_VOCAB_TYPE_BPE) {
1805
+ add_space_prefix = false;
1806
+ clean_spaces = true;
1807
+ if (tokenizer_pre.empty()) {
1808
+ LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
1809
+ LLAMA_LOG_WARN("%s: \n", __func__);
1810
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1811
+ LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
1812
+ LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
1813
+ LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
1814
+ LLAMA_LOG_WARN("%s: \n", __func__);
1815
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1816
+ } else if (tokenizer_pre == "default") {
1817
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1818
+ } else if (
1819
+ tokenizer_pre == "llama3" ||
1820
+ tokenizer_pre == "llama-v3" ||
1821
+ tokenizer_pre == "llama-bpe"||
1822
+ tokenizer_pre == "falcon3" ||
1823
+ tokenizer_pre == "falcon-h1" ||
1824
+ tokenizer_pre == "pixtral" ||
1825
+ tokenizer_pre == "midm-2.0" ||
1826
+ tokenizer_pre == "lfm2") {
1827
+ pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
1828
+ ignore_merges = true;
1829
+ add_bos = true;
1830
+ } else if (
1831
+ tokenizer_pre == "deepseek-llm") {
1832
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
1833
+ clean_spaces = false;
1834
+ } else if (
1835
+ tokenizer_pre == "deepseek-coder") {
1836
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1837
+ clean_spaces = false;
1838
+ } else if (
1839
+ tokenizer_pre == "deepseek-v3") {
1840
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1841
+ clean_spaces = false;
1842
+ } else if (
1843
+ tokenizer_pre == "falcon") {
1844
+ pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
1845
+ } else if (
1846
+ tokenizer_pre == "mpt") {
1847
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
1848
+ } else if (
1849
+ tokenizer_pre == "starcoder") {
1850
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
1851
+ } else if (
1852
+ tokenizer_pre == "gpt-2" ||
1853
+ tokenizer_pre == "phi-2" ||
1854
+ tokenizer_pre == "jina-es" ||
1855
+ tokenizer_pre == "jina-de" ||
1856
+ tokenizer_pre == "gigachat" ||
1857
+ tokenizer_pre == "jina-v2-es" ||
1858
+ tokenizer_pre == "jina-v2-de" ||
1859
+ tokenizer_pre == "a.x-4.0" ||
1860
+ tokenizer_pre == "mellum") {
1861
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1862
+ } else if (
1863
+ tokenizer_pre == "jina-v1-en" ||
1864
+ tokenizer_pre == "jina-v2-code" ||
1865
+ tokenizer_pre == "roberta-bpe") {
1866
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1867
+ add_sep = true;
1868
+ } else if (
1869
+ tokenizer_pre == "refact") {
1870
+ pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
1871
+ } else if (
1872
+ tokenizer_pre == "command-r") {
1873
+ pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
1874
+ clean_spaces = false;
1875
+ } else if (
1876
+ tokenizer_pre == "qwen2" ||
1877
+ tokenizer_pre == "deepseek-r1-qwen") {
1878
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1879
+ clean_spaces = false;
1880
+ } else if (
1881
+ tokenizer_pre == "stablelm2") {
1882
+ pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
1883
+ } else if (
1884
+ tokenizer_pre == "olmo") {
1885
+ pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
1886
+ } else if (
1887
+ tokenizer_pre == "dbrx") {
1888
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
1889
+ } else if (
1890
+ tokenizer_pre == "smaug-bpe") {
1891
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
1892
+ } else if (
1893
+ tokenizer_pre == "poro-chat") {
1894
+ pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
1895
+ clean_spaces = false;
1896
+ } else if (
1897
+ tokenizer_pre == "glm4" ||
1898
+ tokenizer_pre == "chatglm-bpe") {
1899
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
1900
+ special_bos_id = LLAMA_TOKEN_NULL;
1901
+ } else if (
1902
+ tokenizer_pre == "viking") {
1903
+ pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
1904
+ clean_spaces = false;
1905
+ } else if (
1906
+ tokenizer_pre == "jais") {
1907
+ pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
1908
+ } else if (
1909
+ tokenizer_pre == "tekken") {
1910
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
1911
+ clean_spaces = false;
1912
+ ignore_merges = true;
1913
+ add_bos = true;
1914
+ } else if (
1915
+ tokenizer_pre == "smollm") {
1916
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
1917
+ clean_spaces = false;
1918
+ } else if (
1919
+ tokenizer_pre == "codeshell") {
1920
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
1921
+ } else if (
1922
+ tokenizer_pre == "bloom") {
1923
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
1924
+ } else if (
1925
+ tokenizer_pre == "gpt3-finnish") {
1926
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1927
+ } else if (
1928
+ tokenizer_pre == "exaone") {
1929
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1930
+ } else if (
1931
+ tokenizer_pre == "exaone4") {
1932
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
1933
+ } else if (
1934
+ tokenizer_pre == "chameleon") {
1935
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1936
+ add_bos = true;
1937
+ clean_spaces = false;
1938
+ } else if (
1939
+ tokenizer_pre == "minerva-7b") {
1940
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
1941
+ } else if (
1942
+ tokenizer_pre == "megrez") {
1943
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1944
+ } else if (
1945
+ tokenizer_pre == "gpt-4o" ||
1946
+ tokenizer_pre == "llama4") {
1947
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
1948
+ clean_spaces = false;
1949
+ } else if (
1950
+ tokenizer_pre == "superbpe") {
1951
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
1952
+ clean_spaces = false;
1953
+ } else if (
1954
+ tokenizer_pre == "trillion") {
1955
+ pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
1956
+ clean_spaces = false;
1957
+ } else if (
1958
+ tokenizer_pre == "bailingmoe") {
1959
+ pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
1960
+ clean_spaces = false;
1961
+ } else if (
1962
+ tokenizer_pre == "seed-coder") {
1963
+ pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
1964
+ clean_spaces = false;
1965
+ } else if (
1966
+ tokenizer_pre == "hunyuan") {
1967
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
1968
+ clean_spaces = false;
1969
+ } else if (
1970
+ tokenizer_pre == "hunyuan-dense") {
1971
+ pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE;
1972
+ clean_spaces = false;
1973
+ } else if (
1974
+ tokenizer_pre == "kimi-k2") {
1975
+ pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
1976
+ clean_spaces = false;
1977
+ } else {
1978
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
1979
+ }
1980
+ } else if (type == LLAMA_VOCAB_TYPE_SPM) {
1981
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1982
+ add_space_prefix = true;
1983
+ clean_spaces = false;
1984
+ add_bos = true;
1985
+ add_eos = false;
1986
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
1987
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1988
+ add_space_prefix = false;
1989
+ clean_spaces = true;
1990
+ add_bos = true;
1991
+ add_eos = false;
1992
+ add_sep = true;
1993
+ } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1994
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1995
+ add_bos = false;
1996
+ add_eos = true;
1997
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
1998
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1999
+ add_space_prefix = false;
2000
+ clean_spaces = false;
2001
+ add_bos = false;
2002
+ add_eos = false;
2003
+ } else {
2004
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2005
+ }
2006
+
2007
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
2008
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
2009
+ }
2010
+
2011
+ const int token_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
2012
+ if (token_idx == -1) {
2013
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
2014
+ }
2015
+
2016
+ const float * scores = nullptr;
2017
+ const int score_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
2018
+ if (score_idx != -1) {
2019
+ scores = (const float * ) lm_gguf_get_arr_data(ctx, score_idx);
2020
+ }
2021
+
2022
+ const int * toktypes = nullptr;
2023
+ const int toktype_idx = lm_gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
2024
+ if (toktype_idx != -1) {
2025
+ toktypes = (const int * ) lm_gguf_get_arr_data(ctx, toktype_idx);
2026
+ }
2027
+
2028
+ uint32_t n_tokens = lm_gguf_get_arr_n(ctx, token_idx);
2029
+ id_to_token.resize(n_tokens);
2030
+
2031
+ for (uint32_t i = 0; i < n_tokens; i++) {
2032
+ std::string word = lm_gguf_get_arr_str(ctx, token_idx, i);
2033
+ if (word.empty()) {
2034
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
2035
+ word = "[EMPTY_" + std::to_string(i) + "]";
2036
+ }
2037
+
2038
+ token_to_id[word] = i;
2039
+ max_token_len = std::max(max_token_len, (int) word.size());
2040
+
2041
+ auto & token_data = id_to_token[i];
2042
+ token_data.text = std::move(word);
2043
+ token_data.score = scores ? scores[i] : 0.0f;
2044
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
2045
+
2046
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
2047
+ switch(toktypes[i]) {
2048
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
2049
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
2050
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
2051
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
2052
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
2053
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
2054
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2055
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
2056
+ }
2057
+ }
2058
+ }
2059
+ LM_GGML_ASSERT(id_to_token.size() == token_to_id.size());
2060
+
2061
+ init_tokenizer(type);
2062
+
2063
+ // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
2064
+ if (type == LLAMA_VOCAB_TYPE_SPM) {
2065
+ try {
2066
+ linefeed_id = vocab.byte_to_token('\n');
2067
+ } catch (const std::exception & e) {
2068
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
2069
+ linefeed_id = special_pad_id;
2070
+ }
2071
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
2072
+ linefeed_id = special_pad_id;
2073
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
2074
+ const std::vector<int> ids = tokenize("\n", false);
2075
+ LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2076
+ linefeed_id = ids[0];
2077
+ } else {
2078
+ const std::vector<int> ids = tokenize("\n", false);
2079
+
2080
+ //LM_GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
2081
+ if (ids.empty()) {
2082
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
2083
+ linefeed_id = special_pad_id;
2084
+ } else {
2085
+ linefeed_id = ids[0];
2086
+ }
2087
+ }
2088
+
2089
+ // special tokens
2090
+ {
2091
+ const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
2092
+ { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
2093
+ { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
2094
+ { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
2095
+ { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
2096
+ { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
2097
+ { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
2098
+ { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
2099
+ { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
2100
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
2101
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
2102
+ { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
2103
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
2104
+ { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
2105
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
2106
+
2107
+ // deprecated
2108
+ { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
2109
+ { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
2110
+ { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
2111
+ };
2112
+
2113
+ for (const auto & it : special_token_types) {
2114
+ const std::string & key = kv(std::get<0>(it));
2115
+ int32_t & id = std::get<1>(it);
2116
+
2117
+ uint32_t new_id;
2118
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
2119
+ continue;
2120
+ }
2121
+ if (new_id >= id_to_token.size()) {
2122
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
2123
+ __func__, key.c_str(), new_id, id);
2124
+ } else {
2125
+ id = new_id;
2126
+ }
2127
+ }
2128
+
2129
+ // Handle add_bos, add_eos and add_sep
2130
+ {
2131
+ bool temp = true;
2132
+
2133
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
2134
+ add_bos = temp;
2135
+ }
2136
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
2137
+ add_eos = temp;
2138
+ }
2139
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
2140
+ add_sep = temp;
2141
+ }
2142
+ }
2143
+
2144
+ // auto-detect special tokens by text
2145
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
2146
+ // for now, we apply this workaround to find the tokens based on their text
2147
+
2148
+ for (const auto & t : token_to_id) {
2149
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
2150
+ if (special_eot_id == LLAMA_TOKEN_NULL) {
2151
+ if (false
2152
+ || t.first == "<|eot_id|>"
2153
+ || t.first == "<|im_end|>"
2154
+ || t.first == "<|end|>"
2155
+ || t.first == "<end_of_turn>"
2156
+ || t.first == "<|endoftext|>"
2157
+ || t.first == "<EOT>"
2158
+ || t.first == "_<EOT>"
2159
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
2160
+ || t.first == "<end_of_utterance>" // smoldocling
2161
+ ) {
2162
+ special_eot_id = t.second;
2163
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2164
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2165
+ __func__, t.second, t.first.c_str());
2166
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2167
+ }
2168
+ }
2169
+ }
2170
+
2171
+ // find EOM token: "<|eom_id|>"
2172
+ if (special_eom_id == LLAMA_TOKEN_NULL) {
2173
+ if (false
2174
+ || t.first == "<|eom_id|>"
2175
+ ) {
2176
+ special_eom_id = t.second;
2177
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2178
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2179
+ __func__, t.second, t.first.c_str());
2180
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2181
+ }
2182
+ }
2183
+ }
2184
+
2185
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
2186
+ if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
2187
+ if (false
2188
+ || t.first == "<|fim_prefix|>" // Qwen
2189
+ || t.first == "<fim-prefix>"
2190
+ || t.first == "<fim_prefix>" // Granite
2191
+ || t.first == "<|fim▁begin|>" // DeepSeek
2192
+ || t.first == "<PRE>"
2193
+ || t.first == "▁<PRE>" // CodeLlama
2194
+ || t.first == "<|code_prefix|>" // GLM-4.5
2195
+ ) {
2196
+ special_fim_pre_id = t.second;
2197
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2198
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2199
+ __func__, t.second, t.first.c_str());
2200
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2201
+ }
2202
+ }
2203
+ }
2204
+
2205
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
2206
+ if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
2207
+ if (false
2208
+ || t.first == "<|fim_suffix|>" // Qwen
2209
+ || t.first == "<fim-suffix>"
2210
+ || t.first == "<fim_suffix>" // Granite
2211
+ || t.first == "<|fim▁hole|>" // DeepSeek
2212
+ || t.first == "<SUF>"
2213
+ || t.first == "▁<SUF>" // CodeLlama
2214
+ || t.first == "<|code_suffix|>" // GLM-4.5
2215
+ ) {
2216
+ special_fim_suf_id = t.second;
2217
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2218
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2219
+ __func__, t.second, t.first.c_str());
2220
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2221
+ }
2222
+ }
2223
+ }
2224
+
2225
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
2226
+ if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
2227
+ if (false
2228
+ || t.first == "<|fim_middle|>" // Qwen
2229
+ || t.first == "<fim-middle>"
2230
+ || t.first == "<fim_middle>" // Granite
2231
+ || t.first == "<|fim▁end|>" // DeepSeek
2232
+ || t.first == "<MID>"
2233
+ || t.first == "▁<MID>" // CodeLlama
2234
+ || t.first == "<|code_middle|>" // GLM-4.5
2235
+ ) {
2236
+ special_fim_mid_id = t.second;
2237
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2238
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2239
+ __func__, t.second, t.first.c_str());
2240
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2241
+ }
2242
+ }
2243
+ }
2244
+
2245
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
2246
+ if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
2247
+ if (false
2248
+ || t.first == "<|fim_pad|>" // Qwen
2249
+ || t.first == "<fim-pad>"
2250
+ || t.first == "<fim_pad>" // Granite
2251
+ || t.first == "<PAD>"
2252
+ ) {
2253
+ special_fim_pad_id = t.second;
2254
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2255
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2256
+ __func__, t.second, t.first.c_str());
2257
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2258
+ }
2259
+ }
2260
+ }
2261
+
2262
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
2263
+ if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
2264
+ if (false
2265
+ || t.first == "<|fim_repo|>" // Qwen
2266
+ || t.first == "<|repo_name|>"
2267
+ || t.first == "<fim-repo>"
2268
+ || t.first == "<REPO>"
2269
+ || t.first == "<reponame>" // Granite
2270
+ ) {
2271
+ special_fim_rep_id = t.second;
2272
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2273
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2274
+ __func__, t.second, t.first.c_str());
2275
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2276
+ }
2277
+ }
2278
+ }
2279
+
2280
+ // find FIM_SEP token: "<|file_sep|>"
2281
+ if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
2282
+ if (false
2283
+ || t.first == "<|file_sep|>" // Qwen
2284
+ ) {
2285
+ special_fim_sep_id = t.second;
2286
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2287
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2288
+ __func__, t.second, t.first.c_str());
2289
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2290
+ }
2291
+ }
2292
+ }
2293
+ }
2294
+
2295
+ // maintain a list of tokens that cause end-of-generation
2296
+ // this is currently determined based on the token text, which is obviously not ideal
2297
+ // ref: https://github.com/ggerganov/llama.cpp/issues/9606
2298
+ special_eog_ids.clear();
2299
+
2300
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
2301
+ special_eog_ids.insert(special_fim_pad_id);
2302
+ }
2303
+
2304
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
2305
+ special_eog_ids.insert(special_fim_rep_id);
2306
+ }
2307
+
2308
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
2309
+ special_eog_ids.insert(special_fim_sep_id);
2310
+ }
2311
+
2312
+ for (const auto & t : token_to_id) {
2313
+ if (false
2314
+ || t.first == "<|eot_id|>"
2315
+ || t.first == "<|im_end|>"
2316
+ || t.first == "<|end|>"
2317
+ || t.first == "<|return|>" // o200k_harmony
2318
+ || t.first == "<|call|>" // o200k_harmony
2319
+ || t.first == "<end_of_turn>"
2320
+ || t.first == "<|endoftext|>"
2321
+ || t.first == "<|eom_id|>"
2322
+ || t.first == "<EOT>"
2323
+ || t.first == "_<EOT>"
2324
+ || t.first == "<|end_of_text|>"
2325
+ || t.first == "<end_of_utterance>" // smoldocling
2326
+ ) {
2327
+ special_eog_ids.insert(t.second);
2328
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
2329
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
2330
+ __func__, t.second, t.first.c_str());
2331
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
2332
+ }
2333
+ } else {
2334
+ // token is control, but not marked as EOG -> print a debug log
2335
+ if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
2336
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
2337
+ __func__, t.second, t.first.c_str());
2338
+ }
2339
+ }
2340
+ }
2341
+
2342
+ // @ngxson : quick hack for gpt-oss, always render these tokens
2343
+ for (const auto & t : token_to_id) {
2344
+ if (t.first == "<|channel|>" || t.first == "<|message|>" || t.first == "<|start|>" || t.first == "<|constrain|>") {
2345
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2346
+ }
2347
+ }
2348
+
2349
+ // sanity checks
2350
+ if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
2351
+ special_eog_ids.insert(special_eos_id);
2352
+ LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2353
+ }
2354
+
2355
+ if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
2356
+ special_eog_ids.insert(special_eot_id);
2357
+ LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2358
+ }
2359
+
2360
+ if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
2361
+ special_eog_ids.insert(special_eom_id);
2362
+ LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
2363
+ }
2364
+
2365
+ // TODO: workaround for o200k_harmony tokenizer: the "<|end|>" token should not be EOG
2366
+ // we don't have a good way to detect this, so for now, if we have "<|return|>" and "<|call|>" tokens,
2367
+ // we remove the "<|end|>" token from the EOG list
2368
+ {
2369
+ bool has_return = false;
2370
+ bool has_call = false;
2371
+ bool has_end = false;
2372
+
2373
+ llama_token end_id = LLAMA_TOKEN_NULL;
2374
+
2375
+ LLAMA_LOG_INFO("%s: printing all EOG tokens:\n", __func__);
2376
+ for (auto tid : special_eog_ids) {
2377
+ LLAMA_LOG_INFO("%s: - %d ('%s')\n", __func__, tid, id_to_token[tid].text.c_str());
2378
+
2379
+ if (id_to_token[tid].text == "<|return|>") {
2380
+ has_return = true;
2381
+ } else if (id_to_token[tid].text == "<|call|>") {
2382
+ has_call = true;
2383
+ } else if (id_to_token[tid].text == "<|end|>") {
2384
+ has_end = true;
2385
+ end_id = tid;
2386
+ }
2387
+ }
2388
+
2389
+ if (has_return && has_call && has_end) {
2390
+ special_eog_ids.erase(end_id);
2391
+ id_to_token[end_id].attr = LLAMA_TOKEN_ATTR_USER_DEFINED;
2392
+ LLAMA_LOG_WARN("%s: special_eog_ids contains both '<|return|>' and '<|call|>' tokens, removing '<|end|>' token from EOG list\n", __func__);
2393
+ }
2394
+ }
2395
+ }
2396
+
2397
+ // build special tokens cache
2398
+ {
2399
+ for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
2400
+ if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
2401
+ cache_special_tokens.push_back(id);
2402
+ }
2403
+ }
2404
+
2405
+ std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
2406
+ [&] (const llama_token a, const llama_token b) {
2407
+ return id_to_token[a].text.size() > id_to_token[b].text.size();
2408
+ }
2409
+ );
2410
+
2411
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
2412
+ }
2413
+
2414
+ // build token to piece cache
2415
+ {
2416
+ size_t size_cache = 0;
2417
+
2418
+ std::vector<std::string> cache(n_tokens);
2419
+
2420
+ for (uint32_t id = 0; id < n_tokens; ++id) {
2421
+ cache[id] = token_to_piece_for_cache(id, true);
2422
+
2423
+ size_cache += cache[id].size();
2424
+ }
2425
+
2426
+ std::swap(cache_token_to_piece, cache);
2427
+
2428
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
2429
+ }
2430
+
2431
+ // Handle per token attributes
2432
+ //NOTE: Each model customizes per token attributes.
2433
+ //NOTE: Per token attributes are missing from the GGUF file.
2434
+ //TODO: Extract attributes from GGUF file.
2435
+ {
2436
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string_view> & substrs) -> bool {
2437
+ for (const auto & substr : substrs) {
2438
+ if (str.find(substr) != std::string::npos) {
2439
+ return true;
2440
+ }
2441
+ }
2442
+ return false;
2443
+ };
2444
+
2445
+ auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
2446
+ uint32_t current = id_to_token.at(id).attr;
2447
+ current = value ? (current | attr) : (current & ~attr);
2448
+ id_to_token[id].attr = (llama_token_attr) current;
2449
+ };
2450
+
2451
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
2452
+ _set_tokenid_attr(token_to_id.at(token), attr, value);
2453
+ };
2454
+
2455
+ std::string model_name;
2456
+ std::string tokenizer_pre;
2457
+ std::string general_arch;
2458
+
2459
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
2460
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2461
+ ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
2462
+
2463
+ // model name to lowercase
2464
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
2465
+ [] (const std::string::value_type x) {
2466
+ return std::tolower(x);
2467
+ }
2468
+ );
2469
+
2470
+ // set attributes by model/tokenizer/architecture name
2471
+ if (false
2472
+ || _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2473
+ || _contains_any(general_arch, {"nomic-bert-moe"})
2474
+ ) {
2475
+ if (token_to_id.count("<mask>") == 0) {
2476
+ LLAMA_LOG_WARN("%s: Mask token is missing in vocab, please reconvert model!\n", __func__);
2477
+ } else {
2478
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
2479
+ }
2480
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
2481
+ for (auto id : cache_special_tokens) {
2482
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
2483
+ }
2484
+ for (const auto * token : {"</s>"}) {
2485
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
2486
+ }
2487
+ for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
2488
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
2489
+ }
2490
+ }
2491
+ }
2492
+ }
2493
+
2494
+ enum llama_vocab_type llama_vocab::impl::get_type() const {
2495
+ return type;
2496
+ }
2497
+
2498
+ std::string llama_vocab::impl::type_name() const{
2499
+ switch (type) {
2500
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
2501
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
2502
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
2503
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
2504
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
2505
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
2506
+ case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
2507
+ default: return "unknown";
2508
+ }
2509
+ }
2510
+
2511
+ bool llama_vocab::impl::is_normal(llama_token id) const {
2512
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2513
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
2514
+ }
2515
+
2516
+ bool llama_vocab::impl::is_unknown(llama_token id) const {
2517
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2518
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
2519
+ }
2520
+
2521
+ bool llama_vocab::impl::is_control(llama_token id) const {
2522
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2523
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
2524
+ }
2525
+
2526
+ bool llama_vocab::impl::is_byte(llama_token id) const {
2527
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2528
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
2529
+ }
2530
+
2531
+ bool llama_vocab::impl::is_user_defined(llama_token id) const {
2532
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2533
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
2534
+ }
2535
+
2536
+ bool llama_vocab::impl::is_unused(llama_token id) const {
2537
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2538
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
2539
+ }
2540
+
2541
+ bool llama_vocab::impl::is_eog(llama_token id) const {
2542
+ return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
2543
+ }
2544
+
2545
+ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
2546
+ LM_GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
2547
+ LM_GGML_ASSERT(is_byte(id));
2548
+ const auto & token_data = id_to_token.at(id);
2549
+ switch (get_type()) {
2550
+ case LLAMA_VOCAB_TYPE_SPM:
2551
+ case LLAMA_VOCAB_TYPE_UGM: {
2552
+ auto buf = token_data.text.substr(3, 2);
2553
+ return strtol(buf.c_str(), NULL, 16);
2554
+ }
2555
+ case LLAMA_VOCAB_TYPE_BPE: {
2556
+ LM_GGML_ABORT("fatal error");
2557
+ }
2558
+ case LLAMA_VOCAB_TYPE_WPM: {
2559
+ LM_GGML_ABORT("fatal error");
2560
+ }
2561
+ default:
2562
+ LM_GGML_ABORT("fatal error");
2563
+ }
2564
+ }
2565
+
2566
+ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
2567
+ LM_GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
2568
+ return id_to_token.at(id).attr;
2569
+ }
2570
+
2571
+ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
2572
+ LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
2573
+
2574
+ switch (type) {
2575
+ case LLAMA_VOCAB_TYPE_SPM:
2576
+ tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
2577
+ break;
2578
+ case LLAMA_VOCAB_TYPE_BPE:
2579
+ tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
2580
+ break;
2581
+ case LLAMA_VOCAB_TYPE_WPM:
2582
+ tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
2583
+ break;
2584
+ case LLAMA_VOCAB_TYPE_UGM:
2585
+ tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
2586
+ break;
2587
+ case LLAMA_VOCAB_TYPE_RWKV:
2588
+ tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
2589
+ break;
2590
+ case LLAMA_VOCAB_TYPE_PLAMO2:
2591
+ tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
2592
+ break;
2593
+ default:
2594
+ LM_GGML_ABORT("unsupported vocab type");
2595
+ }
2596
+ }
2597
+
2598
+ //
2599
+ // (de-) tokenize
2600
+ //
2601
+
2602
+ // #define PRETOKENIZERDEBUG
2603
+
2604
+ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
2605
+ // for each special token
2606
+ for (const llama_token special_id : cache_special_tokens) {
2607
+ const auto & data = vocab.get_token_data(special_id);
2608
+ const auto & text = data.text;
2609
+
2610
+ if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
2611
+ // Ignore control and unknown tokens when parse_special == false
2612
+ continue;
2613
+ // User-defined tokens are still pre-tokenized before everything else
2614
+ // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
2615
+ // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
2616
+ }
2617
+
2618
+ // for each text fragment
2619
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
2620
+ while (it != buffer.end()) {
2621
+ auto & fragment = (*it);
2622
+
2623
+ // if a fragment is text ( not yet processed )
2624
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2625
+ const auto & raw_text = fragment.raw_text;
2626
+
2627
+ auto raw_text_base_offset = fragment.offset;
2628
+ auto raw_text_base_length = fragment.length;
2629
+
2630
+ // loop over the text
2631
+ while (true) {
2632
+ // find the first occurrence of a given special token in this fragment
2633
+ // passing offset argument only limit the "search area" but match coordinates
2634
+ // are still relative to the source full raw_text
2635
+ // string_view begins at pos 0 for the same reason
2636
+ auto match = std::string_view(raw_text.data(), raw_text_base_offset + raw_text_base_length).find(text, raw_text_base_offset);
2637
+
2638
+ // no occurrences found, stop processing this fragment for a given special token
2639
+ if (match == std::string::npos) break;
2640
+
2641
+ #ifdef PRETOKENIZERDEBUG
2642
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2643
+ #endif
2644
+ auto source = std::distance(buffer.begin(), it);
2645
+
2646
+ // if match is further than base offset
2647
+ // then we have some text to the left of it
2648
+ if (match > raw_text_base_offset) {
2649
+ // left
2650
+ const int64_t left_reminder_offset = raw_text_base_offset + 0;
2651
+ int64_t left_reminder_length = match - raw_text_base_offset;
2652
+
2653
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
2654
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
2655
+ left_reminder_length--;
2656
+ }
2657
+ }
2658
+
2659
+ if (left_reminder_length > 0) {
2660
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
2661
+ it++;
2662
+ }
2663
+
2664
+ #ifdef PRETOKENIZERDEBUG
2665
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
2666
+ #endif
2667
+ }
2668
+
2669
+ // special token
2670
+ buffer.emplace_after(it, special_id);
2671
+ it++;
2672
+
2673
+ // right
2674
+ if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
2675
+ int64_t right_reminder_offset = match + text.length();
2676
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
2677
+
2678
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
2679
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
2680
+ right_reminder_offset++;
2681
+ right_reminder_length--;
2682
+ }
2683
+ }
2684
+
2685
+ if (right_reminder_length > 0) {
2686
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
2687
+ it++;
2688
+ }
2689
+
2690
+ #ifdef PRETOKENIZERDEBUG
2691
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
2692
+ #endif
2693
+
2694
+ if (source == 0) {
2695
+ buffer.erase_after(buffer.before_begin());
2696
+ } else {
2697
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
2698
+ }
2699
+
2700
+ // repeat for the right side
2701
+ raw_text_base_offset = right_reminder_offset;
2702
+ raw_text_base_length = right_reminder_length;
2703
+
2704
+ #ifdef PRETOKENIZERDEBUG
2705
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
2706
+ #endif
2707
+ } else {
2708
+ if (source == 0) {
2709
+ buffer.erase_after(buffer.before_begin());
2710
+ } else {
2711
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
2712
+ }
2713
+ break;
2714
+ }
2715
+ }
2716
+ }
2717
+ it++;
2718
+ }
2719
+ }
2720
+ }
2721
+
2722
+ // NOTE: avoid ever using this except for building the token_to_piece caches
2723
+ std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
2724
+ std::string piece;
2725
+ piece.resize(piece.capacity()); // using string internal cache
2726
+ const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2727
+ if (n_chars < 0) {
2728
+ piece.resize(-n_chars);
2729
+ int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
2730
+ LM_GGML_ASSERT(check == -n_chars);
2731
+ }
2732
+ else {
2733
+ piece.resize(n_chars);
2734
+ }
2735
+
2736
+ return piece;
2737
+ }
2738
+
2739
+ static void llama_escape_whitespace(std::string & text) {
2740
+ replace_all(text, " ", "\xe2\x96\x81");
2741
+ }
2742
+
2743
+ static void llama_unescape_whitespace(std::string & word) {
2744
+ replace_all(word, "\xe2\x96\x81", " ");
2745
+ }
2746
+
2747
+ static std::string llama_decode_text(const std::string & text) {
2748
+ std::string decoded_text;
2749
+
2750
+ const auto cpts = unicode_cpts_from_utf8(text);
2751
+ for (const auto cpt : cpts) {
2752
+ const auto utf8 = unicode_cpt_to_utf8(cpt);
2753
+ try {
2754
+ decoded_text += unicode_utf8_to_byte(utf8);
2755
+ } catch (const std::out_of_range & /*e*/) {
2756
+ decoded_text += "[UNK_BYTE_0x";
2757
+ for (const auto c : utf8) {
2758
+ decoded_text += format("%02x", (uint8_t) c);
2759
+ }
2760
+ decoded_text += text + "]";
2761
+ }
2762
+ }
2763
+
2764
+ return decoded_text;
2765
+ }
2766
+
2767
+ std::vector<llama_token> llama_vocab::impl::tokenize(
2768
+ const std::string & raw_text,
2769
+ bool add_special,
2770
+ bool parse_special) const {
2771
+ LM_GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
2772
+
2773
+ std::vector<llama_token> output;
2774
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
2775
+
2776
+ if (!raw_text.empty()) {
2777
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
2778
+ tokenizer_st_partition(fragment_buffer, parse_special);
2779
+ }
2780
+
2781
+ switch (get_type()) {
2782
+ case LLAMA_VOCAB_TYPE_SPM:
2783
+ {
2784
+ // OG tokenizer behavior:
2785
+ //
2786
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
2787
+ // tokenizer.encode('', add_special_tokens=False) returns []
2788
+
2789
+ bool is_prev_special = true; // prefix with space if first token
2790
+
2791
+ if (add_special && add_bos) {
2792
+ LM_GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2793
+ output.push_back(special_bos_id);
2794
+ is_prev_special = true;
2795
+ }
2796
+
2797
+ for (const auto & fragment : fragment_buffer) {
2798
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2799
+ std::string text;
2800
+
2801
+ // prefix with space if previous is special
2802
+ if (add_space_prefix && is_prev_special) {
2803
+ text = ' ';
2804
+ }
2805
+
2806
+ text += fragment.raw_text.substr(fragment.offset, fragment.length);
2807
+
2808
+ #ifdef PRETOKENIZERDEBUG
2809
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2810
+ #endif
2811
+ llama_escape_whitespace(text);
2812
+ llm_tokenizer_spm_session session(vocab);
2813
+ session.tokenize(text, output);
2814
+ is_prev_special = false;
2815
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2816
+ output.push_back(fragment.token);
2817
+ is_prev_special = true;
2818
+ }
2819
+ }
2820
+
2821
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2822
+ LLAMA_LOG_WARN(
2823
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2824
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2825
+ "Are you sure this is what you want?\n", __FUNCTION__);
2826
+ }
2827
+
2828
+ if (add_special && add_eos) {
2829
+ LM_GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2830
+ output.push_back(special_eos_id);
2831
+ }
2832
+ } break;
2833
+ case LLAMA_VOCAB_TYPE_BPE:
2834
+ {
2835
+ llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
2836
+ // it calls some other methods that are not exist in llm_tokenizer,
2837
+ // here just cast it to bpe tokenizer object
2838
+ if (add_special) {
2839
+ session.append_bos(output);
2840
+ }
2841
+ for (const auto & fragment : fragment_buffer) {
2842
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2843
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2844
+
2845
+ #ifdef PRETOKENIZERDEBUG
2846
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2847
+ #endif
2848
+ session.tokenize(text, output);
2849
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2850
+ session.append(fragment.token, output);
2851
+ }
2852
+ }
2853
+
2854
+ if (add_special) {
2855
+ session.append_eos(output);
2856
+ session.check_double_bos_eos(output);
2857
+ }
2858
+ } break;
2859
+ case LLAMA_VOCAB_TYPE_WPM:
2860
+ {
2861
+ if (add_special) {
2862
+ LM_GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2863
+ output.push_back(special_bos_id);
2864
+ }
2865
+
2866
+ llm_tokenizer_wpm_session session(vocab);
2867
+
2868
+ for (const auto & fragment : fragment_buffer) {
2869
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2870
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2871
+
2872
+ #ifdef PRETOKENIZERDEBUG
2873
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2874
+ #endif
2875
+ session.tokenize(text, output);
2876
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2877
+ output.push_back(fragment.token);
2878
+ }
2879
+ }
2880
+
2881
+ if (add_special) {
2882
+ LM_GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
2883
+ output.push_back(special_sep_id);
2884
+ }
2885
+ } break;
2886
+ case LLAMA_VOCAB_TYPE_UGM:
2887
+ {
2888
+ if (add_special && add_bos) {
2889
+ LM_GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
2890
+ output.push_back(special_bos_id);
2891
+ }
2892
+ llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
2893
+
2894
+ for (const auto & fragment : fragment_buffer) {
2895
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2896
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2897
+ #ifdef PRETOKENIZERDEBUG
2898
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2899
+ #endif
2900
+ session.tokenize(text, output);
2901
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2902
+ output.push_back(fragment.token);
2903
+ }
2904
+ }
2905
+
2906
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
2907
+ LLAMA_LOG_WARN(
2908
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
2909
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
2910
+ "Are you sure this is what you want?\n", __FUNCTION__);
2911
+ }
2912
+
2913
+ if (add_special && add_eos) {
2914
+ LM_GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
2915
+ output.push_back(special_eos_id);
2916
+ }
2917
+ } break;
2918
+ case LLAMA_VOCAB_TYPE_RWKV:
2919
+ {
2920
+ llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
2921
+ for (const auto & fragment : fragment_buffer) {
2922
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2923
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2924
+
2925
+ #ifdef PRETOKENIZERDEBUG
2926
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2927
+ #endif
2928
+
2929
+ session.tokenize(text, output);
2930
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2931
+ output.push_back(fragment.token);
2932
+ }
2933
+ }
2934
+ } break;
2935
+ case LLAMA_VOCAB_TYPE_PLAMO2:
2936
+ {
2937
+ llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
2938
+ for (const auto & fragment : fragment_buffer) {
2939
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
2940
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
2941
+
2942
+ #ifdef PRETOKENIZERDEBUG
2943
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
2944
+ #endif
2945
+
2946
+ session.tokenize(text, output);
2947
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
2948
+ output.push_back(fragment.token);
2949
+ }
2950
+ }
2951
+ } break;
2952
+ case LLAMA_VOCAB_TYPE_NONE:
2953
+ LM_GGML_ABORT("fatal error");
2954
+ }
2955
+
2956
+ return output;
2957
+ }
2958
+
2959
+ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
2960
+ // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
2961
+ static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
2962
+ const llama_token_attr attr = token_get_attr(token);
2963
+ if (!special && (attr & attr_special)) {
2964
+ return 0;
2965
+ }
2966
+
2967
+ // copy piece chars to output text buffer
2968
+ // skip up to 'lstrip' leading spaces before copying
2969
+ auto _try_copy = [=] (const char * token, size_t size) -> int32_t {
2970
+ if (size >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
2971
+ LM_GGML_ABORT("invalid token size: %zu exceeds int32_t limit", size);
2972
+ }
2973
+
2974
+ for (int32_t i = 0; i < lstrip && size && *token == ' '; ++i) {
2975
+ token++;
2976
+ size--;
2977
+ }
2978
+ if (length < (int32_t)size) {
2979
+ return -(int32_t) size;
2980
+ }
2981
+ memcpy(buf, token, size);
2982
+ return (int32_t) size;
2983
+ };
2984
+
2985
+ // if we have a cache - use it
2986
+ {
2987
+ const auto & cache = cache_token_to_piece;
2988
+
2989
+ if (!cache.empty()) {
2990
+ const auto & result = cache.at(token);
2991
+ return _try_copy(result.data(), result.size());
2992
+ }
2993
+ }
2994
+
2995
+ if (0 <= token && token < (int32_t) id_to_token.size()) {
2996
+ const std::string & token_text = id_to_token[token].text;
2997
+ switch (get_type()) {
2998
+ case LLAMA_VOCAB_TYPE_WPM:
2999
+ case LLAMA_VOCAB_TYPE_SPM:
3000
+ case LLAMA_VOCAB_TYPE_UGM: {
3001
+ // NOTE: we accept all unsupported token types,
3002
+ // suppressing them like CONTROL tokens.
3003
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3004
+ return _try_copy(token_text.data(), token_text.size());
3005
+ }
3006
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3007
+ std::string result = token_text;
3008
+ llama_unescape_whitespace(result);
3009
+ return _try_copy(result.data(), result.size());
3010
+ }
3011
+ if (attr & LLAMA_TOKEN_ATTR_BYTE) {
3012
+ char byte = (char) token_to_byte(token);
3013
+ return _try_copy((char*) &byte, 1);
3014
+ }
3015
+ break;
3016
+ }
3017
+ case LLAMA_VOCAB_TYPE_BPE: {
3018
+ // NOTE: we accept all unsupported token types,
3019
+ // suppressing them like CONTROL tokens.
3020
+ if (attr & (attr_special | LLAMA_TOKEN_ATTR_USER_DEFINED)) {
3021
+ return _try_copy(token_text.data(), token_text.size());
3022
+ }
3023
+ if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3024
+ std::string result = llama_decode_text(token_text);
3025
+ return _try_copy(result.data(), result.size());
3026
+ }
3027
+ break;
3028
+ }
3029
+ case LLAMA_VOCAB_TYPE_RWKV: {
3030
+ std::vector<uint8_t> result = llama_unescape_rwkv_token(token_text);
3031
+
3032
+ // If we don't have enough space, return an error
3033
+ if (result.size() > (size_t)length) {
3034
+ return -(int)result.size();
3035
+ }
3036
+
3037
+ memcpy(buf, result.data(), result.size());
3038
+ return (int)result.size();
3039
+ }
3040
+ case LLAMA_VOCAB_TYPE_PLAMO2: {
3041
+ // PLaMo-2 uses similar token handling as BPE/SPM
3042
+ if (vocab.is_byte(token)) {
3043
+ // Handle byte tokens like <0xXX>
3044
+ if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
3045
+ int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
3046
+ if (length < 1) {
3047
+ return -1;
3048
+ }
3049
+ buf[0] = static_cast<char>(hex_val);
3050
+ return 1;
3051
+ }
3052
+ }
3053
+
3054
+ // Normal token - just copy the text
3055
+ std::string result = token_text;
3056
+ return _try_copy(result.data(), result.size());
3057
+ }
3058
+ default:
3059
+ LM_GGML_ABORT("fatal error");
3060
+ }
3061
+ }
3062
+
3063
+ return 0;
3064
+ }
3065
+
3066
+ const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
3067
+ return cache_token_to_piece.at(token);
3068
+ }
3069
+
3070
+ int32_t llama_vocab::impl::detokenize(
3071
+ const llama_token * tokens,
3072
+ int32_t n_tokens,
3073
+ char * text,
3074
+ int32_t text_len_max,
3075
+ bool remove_special,
3076
+ bool unparse_special) const {
3077
+ if (type == LLAMA_VOCAB_TYPE_NONE) {
3078
+ return 0;
3079
+ }
3080
+
3081
+ LM_GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
3082
+
3083
+ int32_t avail = text_len_max;
3084
+ int32_t total = 0;
3085
+
3086
+ // remove the leading space
3087
+ bool remove_space = add_space_prefix;
3088
+
3089
+ if (remove_special && add_bos) {
3090
+ if (n_tokens > 0 && tokens[0] == special_bos_id) {
3091
+ remove_space = false;
3092
+ n_tokens--;
3093
+ tokens++;
3094
+ }
3095
+ }
3096
+
3097
+ if (remove_special && add_eos) {
3098
+ if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
3099
+ n_tokens--;
3100
+ }
3101
+ }
3102
+
3103
+ for (int32_t i = 0; i < n_tokens; ++i) {
3104
+ LM_GGML_ASSERT(avail >= 0);
3105
+ int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
3106
+ remove_space = false;
3107
+ if (n_chars < 0) {
3108
+ avail = 0;
3109
+ total -= n_chars;
3110
+ } else if (n_chars > 0) {
3111
+ avail -= n_chars;
3112
+ text += n_chars;
3113
+ total += n_chars;
3114
+ }
3115
+ }
3116
+
3117
+ if (total > text_len_max) {
3118
+ return -total;
3119
+ }
3120
+
3121
+ if (clean_spaces) {
3122
+ text -= total; // restart text
3123
+
3124
+ // first pass: characters ?!., //TODO: where do these characters come from?
3125
+ const int32_t total1 = total;
3126
+ total = total ? 1 : 0;
3127
+ for (int32_t i = 1; i < total1; ++i) {
3128
+ const char x = text[i];
3129
+ if (text[i - 1] == ' ') {
3130
+ if (x == '?' || x == '!' || x == '.' || x == ',') { // " ?", " !", " .", " ,"
3131
+ total--; // remove space
3132
+ }
3133
+ }
3134
+ text[total++] = x;
3135
+ }
3136
+
3137
+ // second pass: strip single apostrophe between spaces
3138
+ const int32_t total2 = total;
3139
+ total = total ? 1 : 0;
3140
+ for (int32_t i = 1; i < total2; ++i) {
3141
+ const char x = text[i];
3142
+ if (x == '\'' && i + 1 < total2 && text[i - 1] == ' ' && text[i + 1] == ' ') { // " ' "
3143
+ total--; // remove prev space
3144
+ text[++i] = '\0'; // remove next space
3145
+ }
3146
+ text[total++] = x;
3147
+ }
3148
+
3149
+ // third pass: apostrophe contractions //NOTE: this makes sense?
3150
+ const int32_t total3 = total;
3151
+ total = total ? 1 : 0;
3152
+ for (int32_t i = 1; i < total3; ++i) {
3153
+ const char x = text[i];
3154
+ if (text[i - 1] == ' ') {
3155
+ if (x == '\'' && i + 1 < total3) {
3156
+ const char x1 = text[i + 1];
3157
+ if (x1 == 't' || x1 == 'd') { // " 't", " 'd"
3158
+ //total--; // remove space
3159
+ } else if (x1 == 's' || x1 == 'm') { // " 's", " 'm"
3160
+ total--; // remove space
3161
+ } else if (i + 2 < total3) {
3162
+ const char x2 = text[i + 2];
3163
+ if ((x1 == 'l' && x2 == 'l')) { // " 'll"
3164
+ //total--; // remove space
3165
+ } else if ((x1 == 'r' && x2 == 'e') || (x1 == 'v' && x2 == 'e')) { // " 're", " 've"
3166
+ total--; // remove space
3167
+ } else {
3168
+ //total--; // remove space
3169
+ }
3170
+ } else {
3171
+ //total--; // remove space
3172
+ }
3173
+ }
3174
+ }
3175
+ text[total++] = x;
3176
+ }
3177
+ }
3178
+
3179
+ return total <= text_len_max ? total : -total;
3180
+ }
3181
+
3182
+ void llama_vocab::impl::print_info() const {
3183
+ LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
3184
+ LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
3185
+ LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
3186
+
3187
+ // special tokens
3188
+ if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token.at(special_bos_id).text.c_str() ); }
3189
+ if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token.at(special_eos_id).text.c_str() ); }
3190
+ if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token.at(special_eot_id).text.c_str() ); }
3191
+ if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token.at(special_eom_id).text.c_str() ); }
3192
+ if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token.at(special_unk_id).text.c_str() ); }
3193
+ if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token.at(special_sep_id).text.c_str() ); }
3194
+ if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token.at(special_pad_id).text.c_str() ); }
3195
+ if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token.at(special_mask_id).text.c_str() ); }
3196
+
3197
+ if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token.at(linefeed_id).text.c_str() ); }
3198
+
3199
+ if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token.at(special_fim_pre_id).text.c_str() ); }
3200
+ if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token.at(special_fim_suf_id).text.c_str() ); }
3201
+ if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token.at(special_fim_mid_id).text.c_str() ); }
3202
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token.at(special_fim_pad_id).text.c_str() ); }
3203
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token.at(special_fim_rep_id).text.c_str() ); }
3204
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token.at(special_fim_sep_id).text.c_str() ); }
3205
+
3206
+ for (const auto & id : special_eog_ids) {
3207
+ LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token.at(id).text.c_str() );
3208
+ }
3209
+
3210
+ LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
3211
+ }
3212
+
3213
+ llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
3214
+ }
3215
+
3216
+ llama_vocab::~llama_vocab() {
3217
+ }
3218
+
3219
+ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
3220
+ pimpl->load(ml, kv);
3221
+ }
3222
+
3223
//
// tokenizer identity accessors — thin delegations to the pimpl
//

// name of the tokenizer model recorded in the vocab metadata
std::string llama_vocab::get_tokenizer_model() const {
    return pimpl->tokenizer_model;
}

// name of the pre-tokenizer scheme recorded in the vocab metadata
std::string llama_vocab::get_tokenizer_pre() const {
    return pimpl->tokenizer_pre;
}

// vocabulary type (SPM, BPE, WPM, UGM, PLAMO2, or NONE)
enum llama_vocab_type llama_vocab::get_type() const {
    return pimpl->type;
}

// pre-tokenizer type selected during load
enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
    return pimpl->pre_type;
}

// total number of tokens in the vocabulary
uint32_t llama_vocab::n_tokens() const {
    return (uint32_t) pimpl->id_to_token.size();
}

// number of distinct token types
uint32_t llama_vocab::n_token_types() const {
    return (uint32_t) pimpl->n_token_types;
}

// human-readable name of the vocabulary type
std::string llama_vocab::type_name() const{
    return pimpl->type_name();
}
3250
+
3251
//
// token classification predicates — each checks the token's stored attributes
// via the pimpl
//

bool llama_vocab::is_normal(llama_token id) const {
    return pimpl->is_normal(id);
}

bool llama_vocab::is_unknown(llama_token id) const {
    return pimpl->is_unknown(id);
}

bool llama_vocab::is_control(llama_token id) const {
    return pimpl->is_control(id);
}

bool llama_vocab::is_byte(llama_token id) const {
    return pimpl->is_byte(id);
}

bool llama_vocab::is_user_defined(llama_token id) const {
    return pimpl->is_user_defined(id);
}

bool llama_vocab::is_unused(llama_token id) const {
    return pimpl->is_unused(id);
}

// true when the token ends generation (end-of-generation set; see the EOG ids
// logged by impl::print_info)
bool llama_vocab::is_eog(llama_token id) const {
    return pimpl->is_eog(id);
}
3278
+
3279
// Inverse of byte_to_token(): recover the raw byte encoded by a byte token.
uint8_t llama_vocab::token_to_byte(llama_token id) const {
    return pimpl->token_to_byte(id);
}
3282
+
3283
// Map a raw byte to its vocabulary token id.
// The byte-token spelling depends on the vocab type:
//   SPM/UGM : "<0xXX>" token, falling back to the single-character string
//   WPM/BPE : the byte's UTF-8 representation (via unicode_byte_to_utf8)
//   PLAMO2  : "<0xXX>" token
// Throws std::out_of_range (via map::at) when the byte has no token at all.
llama_token llama_vocab::byte_to_token(uint8_t ch) const {
    LM_GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (get_type()) {
        case LLAMA_VOCAB_TYPE_SPM:
        case LLAMA_VOCAB_TYPE_UGM: {
            // SentencePiece-style byte token, e.g. "<0x41>" for 'A'
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = pimpl->token_to_id.find(buf);
            if (token != pimpl->token_to_id.end()) {
                return (*token).second;
            }
            // Try to fall back to just the byte as a string
            const char buf2[2] = { (char)ch, 0 };
            return pimpl->token_to_id.at(buf2);
        }
        case LLAMA_VOCAB_TYPE_WPM:
        case LLAMA_VOCAB_TYPE_BPE: {
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        case LLAMA_VOCAB_TYPE_PLAMO2: {
            // PLaMo-2 uses byte tokens in format <0xXX>
            char hex_str[8];
            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
            return pimpl->token_to_id.at(hex_str);
        }
        default:
            LM_GGML_ABORT("fatal error");
    }
}
3312
+
3313
+ llama_token llama_vocab::text_to_token(const std::string & text) const {
3314
+ LM_GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
3315
+ auto it = pimpl->token_to_id.find(text);
3316
+ if (it != pimpl->token_to_id.end()) {
3317
+ return (*it).second;
3318
+ }
3319
+ return LLAMA_TOKEN_NULL;
3320
+ }
3321
+
3322
// Full token record (text, score, attributes); throws std::out_of_range for an
// invalid id (vector::at).
const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
    LM_GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id);
}

// Token text; the returned pointer is owned by the vocabulary's storage.
const char * llama_vocab::token_get_text(llama_token id) const {
    LM_GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).text.c_str();
}

// Token score — presumably the SentencePiece merge score/log-prob; confirm
// against the loader.
float llama_vocab::token_get_score(llama_token id) const {
    LM_GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
    return pimpl->id_to_token.at(id).score;
}

// Token attribute bitfield.
llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
    return pimpl->token_get_attr(id);
}
3340
+
3341
//
// special token id accessors — NOTE(review): presumably LLAMA_TOKEN_NULL when
// the model does not define the token (impl::print_info guards on that value);
// confirm in impl
//

llama_token llama_vocab::token_bos() const {
    return pimpl->special_bos_id;
}

llama_token llama_vocab::token_eos() const {
    return pimpl->special_eos_id;
}

llama_token llama_vocab::token_eot() const {
    return pimpl->special_eot_id;
}

llama_token llama_vocab::token_eom() const {
    return pimpl->special_eom_id;
}

llama_token llama_vocab::token_unk() const {
    return pimpl->special_unk_id;
}

llama_token llama_vocab::token_sep() const {
    return pimpl->special_sep_id;
}

llama_token llama_vocab::token_nl() const {
    return pimpl->linefeed_id;
}

llama_token llama_vocab::token_pad() const {
    return pimpl->special_pad_id;
}

// legacy alias: same id as token_fim_pre()
llama_token llama_vocab::token_prefix() const {
    return pimpl->special_fim_pre_id;
}

// legacy alias: same id as token_fim_mid()
llama_token llama_vocab::token_middle() const {
    return pimpl->special_fim_mid_id;
}

// legacy alias: same id as token_fim_suf()
llama_token llama_vocab::token_suffix() const {
    return pimpl->special_fim_suf_id;
}

// fill-in-middle (FIM) tokens
llama_token llama_vocab::token_fim_pre() const {
    return pimpl->special_fim_pre_id;
}

llama_token llama_vocab::token_fim_suf() const {
    return pimpl->special_fim_suf_id;
}

llama_token llama_vocab::token_fim_mid() const {
    return pimpl->special_fim_mid_id;
}

llama_token llama_vocab::token_fim_pad() const {
    return pimpl->special_fim_pad_id;
}

llama_token llama_vocab::token_fim_rep() const {
    return pimpl->special_fim_rep_id;
}

llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}
3412
+
3413
//
// tokenizer behavior flags — read-only views of the loaded configuration
//

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

// whether a BOS token is automatically prepended when add_special is requested
bool llama_vocab::get_add_bos() const {
    return pimpl->add_bos;
}

// whether an EOS token is automatically appended when add_special is requested
bool llama_vocab::get_add_eos() const {
    return pimpl->add_eos;
}

bool llama_vocab::get_add_sep() const {
    return pimpl->add_sep;
}

bool llama_vocab::get_ignore_merges() const {
    return pimpl->ignore_merges;
}

bool llama_vocab::get_clean_spaces() const {
    return pimpl->clean_spaces;
}

bool llama_vocab::get_remove_extra_whitespaces() const {
    return pimpl->remove_extra_whitespaces;
}

bool llama_vocab::get_escape_whitespaces() const {
    return pimpl->escape_whitespaces;
}

bool llama_vocab::get_treat_whitespace_as_suffix() const {
    return pimpl->treat_whitespace_as_suffix;
}

// length in bytes of the longest token text (logged by impl::print_info)
int llama_vocab::max_token_len() const {
    return pimpl->max_token_len;
}
3452
+
3453
+ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
3454
+ LM_GGML_ASSERT(token_left.find(' ') == std::string::npos);
3455
+ LM_GGML_ASSERT(token_left.find('\n') == std::string::npos);
3456
+ LM_GGML_ASSERT(token_right.find(' ') == std::string::npos);
3457
+ LM_GGML_ASSERT(token_right.find('\n') == std::string::npos);
3458
+
3459
+ auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
3460
+ if (it == pimpl->bpe_ranks.end()) {
3461
+ return -1;
3462
+ }
3463
+
3464
+ return it->second;
3465
+ }
3466
+
3467
+ std::vector<std::string> llama_vocab::get_bpe_merges() const {
3468
+ std::vector<std::string> result(pimpl->bpe_ranks.size());
3469
+
3470
+ for (const auto & pair : pimpl->bpe_ranks) {
3471
+ result[pair.second] = pair.first.first + " " + pair.first.second;
3472
+ }
3473
+
3474
+ return result;
3475
+ }
3476
+
3477
// Precompiled charsmap blob — presumably consumed by the tokenizer's text
// normalizer; confirm in impl. NOTE(review): returned by value, so the whole
// blob is copied on every call.
std::vector<char> llama_vocab::get_precompiled_charsmap() const {
    return pimpl->precompiled_charsmap;
}
3480
+
3481
+ int32_t llama_vocab::tokenize(
3482
+ const char * text,
3483
+ int32_t text_len,
3484
+ llama_token * tokens,
3485
+ int32_t n_tokens_max,
3486
+ bool add_special,
3487
+ bool parse_special) const {
3488
+ auto res = tokenize(std::string(text, text_len), add_special, parse_special);
3489
+ if (res.size() >= static_cast<size_t>(std::numeric_limits<int32_t>::max())) {
3490
+ LLAMA_LOG_ERROR("%s: tokenization result size %zu exceeds int32_t limit\n", __func__, res.size());
3491
+ return std::numeric_limits<int32_t>::min();
3492
+ }
3493
+
3494
+ if (n_tokens_max < (int) res.size()) {
3495
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
3496
+ return -((int) res.size());
3497
+ }
3498
+
3499
+ for (size_t i = 0; i < res.size(); i++) {
3500
+ tokens[i] = res[i];
3501
+ }
3502
+
3503
+ return res.size();
3504
+ }
3505
+
3506
// Tokenize a string; forwards to impl::tokenize.
std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    return pimpl->tokenize(raw_text, add_special, parse_special);
}
3512
+
3513
// Text piece for a single token; the reference is owned by the pimpl.
const std::string & llama_vocab::token_to_piece(llama_token token) const {
    return pimpl->token_to_piece(token);
}

// Buffer-based variant; forwards to impl::token_to_piece.
int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
    return pimpl->token_to_piece(token, buf, length, lstrip, special);
}
3520
+
3521
// Detokenize into a caller-provided buffer; forwards to impl::detokenize.
// NOTE(review): a negative return appears to mean "buffer too small, magnitude
// is the required size" (the string overload relies on this) — confirm in impl.
int32_t llama_vocab::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
        bool unparse_special) const {
    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}
3530
+
3531
// Detokenize into a std::string, retrying once with a larger buffer when the
// first pass reports (as a negative count) that more space is needed.
std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
    std::string text;
    // initial guess: one char per token (or whatever the string already holds)
    text.resize(std::max(text.capacity(), tokens.size()));
    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
    if (n_chars < 0) {
        // negative result encodes the required size; retry with an exact buffer
        text.resize(-n_chars);
        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
        LM_GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
    }

    // shrink to the actual number of chars produced
    text.resize(n_chars);

    // NOTE: the original tokenizer decodes bytes after collecting the pieces.
    return text;
}
3546
+
3547
// Log a summary of the vocabulary (special/FIM/EOG tokens, max token length).
void llama_vocab::print_info() const {
    pimpl->print_info();
}
3550
+
3551
//
// interface implementation
//
// C-ABI wrappers: each forwards directly to the corresponding llama_vocab
// method; entries marked "deprecated" are kept for source compatibility.
//

int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
    return vocab->n_tokens();
}

// deprecated
int32_t llama_n_vocab(const struct llama_vocab * vocab) {
    return llama_vocab_n_tokens(vocab);
}

enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
    return vocab->get_type();
}

const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_text(token);
}

float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_score(token);
}

enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return vocab->token_get_attr(token);
}

bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_eog(token);
}

bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
    return vocab->is_control(token);
}

llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
    return vocab->token_eos();
}

llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
    return vocab->token_eot();
}

// deprecated — CLS is mapped to BOS
llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
    return vocab->token_bos();
}

llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
    return vocab->token_sep();
}

llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
    return vocab->token_nl();
}

llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
    return vocab->token_pad();
}

bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
    return vocab->get_add_bos();
}

bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
    return vocab->get_add_eos();
}

bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
    return vocab->get_add_sep();
}

// fill-in-middle (FIM) token accessors
llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
    return vocab->token_fim_pre();
}

llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
    return vocab->token_fim_suf();
}

llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
    return vocab->token_fim_mid();
}

llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
    return vocab->token_fim_pad();
}

llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
    return vocab->token_fim_rep();
}

llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

llama_token llama_vocab_mask(const struct llama_vocab* vocab) {
    return vocab->token_mask();
}
3657
//
// deprecated pre-llama_vocab API names, kept for backward compatibility;
// each forwards to its llama_vocab_* replacement
//

// deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);
}

// deprecated
float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_score(vocab, token);
}

// deprecated
enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_attr(vocab, token);
}

// deprecated
bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_eog(vocab, token);
}

// deprecated
bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_is_control(vocab, token);
}

// deprecated
llama_token llama_token_bos(const struct llama_vocab * vocab) {
    return llama_vocab_bos(vocab);
}

// deprecated
llama_token llama_token_eos(const struct llama_vocab * vocab) {
    return llama_vocab_eos(vocab);
}

// deprecated
llama_token llama_token_eot(const struct llama_vocab * vocab) {
    return llama_vocab_eot(vocab);
}

// deprecated
llama_token llama_token_cls(const struct llama_vocab * vocab) {
    //return llama_vocab_cls(vocab);
    return llama_vocab_bos(vocab); // avoid deprecation warning
}

// deprecated
llama_token llama_token_sep(const struct llama_vocab * vocab) {
    return llama_vocab_sep(vocab);
}

// deprecated
llama_token llama_token_nl (const struct llama_vocab * vocab) {
    return llama_vocab_nl(vocab);
}

// deprecated
llama_token llama_token_pad(const struct llama_vocab * vocab) {
    return llama_vocab_pad(vocab);
}

// deprecated
bool llama_add_bos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_bos(vocab);
}

// deprecated
bool llama_add_eos_token(const struct llama_vocab * vocab) {
    return llama_vocab_get_add_eos(vocab);
}

// deprecated
llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pre(vocab);
}

// deprecated
llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
    return llama_vocab_fim_suf(vocab);
}

// deprecated
llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
    return llama_vocab_fim_mid(vocab);
}

// deprecated
llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
    return llama_vocab_fim_pad(vocab);
}

// deprecated
llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_rep(vocab);
}

// deprecated
llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
    return llama_vocab_fim_sep(vocab);
}
3757
+
3758
//
// tokenization
//

// C-ABI tokenize: returns the token count written to `tokens`, or a negative
// value whose magnitude is the required count when n_tokens_max is too small
// (see llama_vocab::tokenize for the full contract, incl. the int32 overflow
// sentinel).
int32_t llama_tokenize(
    const struct llama_vocab * vocab,
                  const char * text,
                     int32_t   text_len,
                 llama_token * tokens,
                     int32_t   n_tokens_max,
                        bool   add_special,
                        bool   parse_special) {
    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
}

// C-ABI single-token piece rendering; forwards to llama_vocab::token_to_piece.
int32_t llama_token_to_piece(
    const struct llama_vocab * vocab,
                 llama_token   token,
                        char * buf,
                     int32_t   length,
                     int32_t   lstrip,
                        bool   special) {
    return vocab->token_to_piece(token, buf, length, lstrip, special);
}

// C-ABI detokenize; forwards to llama_vocab::detokenize.
int32_t llama_detokenize(
    const struct llama_vocab * vocab,
           const llama_token * tokens,
                     int32_t   n_tokens,
                        char * text,
                     int32_t   text_len_max,
                        bool   remove_special,
                        bool   unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}