@fugood/llama.node 0.3.6 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (186)
  1. package/README.md +17 -2
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-cuda/arm64/llama-node.node +0 -0
  7. package/bin/linux-cuda/x64/llama-node.node +0 -0
  8. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  9. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  10. package/bin/win32/arm64/llama-node.node +0 -0
  11. package/bin/win32/arm64/node.lib +0 -0
  12. package/bin/win32/x64/llama-node.node +0 -0
  13. package/bin/win32/x64/node.lib +0 -0
  14. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/arm64/node.lib +0 -0
  16. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  17. package/bin/win32-vulkan/x64/node.lib +0 -0
  18. package/lib/binding.ts +3 -1
  19. package/lib/index.js +16 -1
  20. package/lib/index.ts +16 -0
  21. package/package.json +1 -1
  22. package/src/EmbeddingWorker.cpp +4 -3
  23. package/src/LlamaCompletionWorker.cpp +4 -2
  24. package/src/LlamaContext.cpp +61 -6
  25. package/src/LlamaContext.h +1 -0
  26. package/src/common.hpp +6 -11
  27. package/src/llama.cpp/.github/workflows/build.yml +19 -17
  28. package/src/llama.cpp/.github/workflows/docker.yml +77 -30
  29. package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +22 -3
  31. package/src/llama.cpp/CMakeLists.txt +49 -24
  32. package/src/llama.cpp/common/arg.cpp +82 -26
  33. package/src/llama.cpp/common/arg.h +3 -0
  34. package/src/llama.cpp/common/common.cpp +192 -72
  35. package/src/llama.cpp/common/common.h +51 -18
  36. package/src/llama.cpp/common/ngram-cache.cpp +12 -12
  37. package/src/llama.cpp/common/ngram-cache.h +2 -2
  38. package/src/llama.cpp/common/sampling.cpp +11 -6
  39. package/src/llama.cpp/common/speculative.cpp +18 -15
  40. package/src/llama.cpp/docs/build.md +2 -0
  41. package/src/llama.cpp/examples/batched/batched.cpp +9 -7
  42. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
  43. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
  44. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
  45. package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
  46. package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
  47. package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
  48. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
  49. package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
  50. package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
  51. package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
  52. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
  53. package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
  54. package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
  55. package/src/llama.cpp/examples/infill/infill.cpp +23 -24
  56. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
  57. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
  58. package/src/llama.cpp/examples/llava/clip.cpp +4 -2
  59. package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
  60. package/src/llama.cpp/examples/llava/llava.cpp +2 -2
  61. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
  62. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
  63. package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
  64. package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
  65. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
  66. package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
  67. package/src/llama.cpp/examples/main/main.cpp +51 -29
  68. package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
  69. package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
  70. package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
  71. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
  72. package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
  73. package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
  74. package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
  75. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
  76. package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
  77. package/src/llama.cpp/examples/run/run.cpp +175 -61
  78. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
  79. package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
  80. package/src/llama.cpp/examples/server/httplib.h +1295 -409
  81. package/src/llama.cpp/examples/server/server.cpp +387 -181
  82. package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
  83. package/src/llama.cpp/examples/server/utils.hpp +170 -58
  84. package/src/llama.cpp/examples/simple/simple.cpp +9 -8
  85. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
  86. package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
  87. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
  88. package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
  89. package/src/llama.cpp/examples/tts/tts.cpp +64 -23
  90. package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
  91. package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
  92. package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
  93. package/src/llama.cpp/ggml/include/ggml.h +36 -145
  94. package/src/llama.cpp/ggml/include/gguf.h +202 -0
  95. package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
  96. package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
  97. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
  98. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
  99. package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
  100. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
  101. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
  102. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
  103. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
  104. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
  105. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
  106. package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
  107. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
  108. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
  109. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
  110. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
  111. package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
  112. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
  113. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
  114. package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
  115. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
  116. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
  117. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
  118. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  119. package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
  120. package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
  121. package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
  122. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
  123. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
  124. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
  125. package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
  126. package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
  127. package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
  128. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
  129. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
  130. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
  131. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
  132. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
  133. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
  134. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
  135. package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
  136. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
  137. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
  138. package/src/llama.cpp/ggml/src/ggml.c +117 -1327
  139. package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
  140. package/src/llama.cpp/include/llama-cpp.h +6 -1
  141. package/src/llama.cpp/include/llama.h +138 -75
  142. package/src/llama.cpp/src/CMakeLists.txt +13 -1
  143. package/src/llama.cpp/src/llama-adapter.cpp +347 -0
  144. package/src/llama.cpp/src/llama-adapter.h +74 -0
  145. package/src/llama.cpp/src/llama-arch.cpp +1487 -0
  146. package/src/llama.cpp/src/llama-arch.h +400 -0
  147. package/src/llama.cpp/src/llama-batch.cpp +368 -0
  148. package/src/llama.cpp/src/llama-batch.h +88 -0
  149. package/src/llama.cpp/src/llama-chat.cpp +578 -0
  150. package/src/llama.cpp/src/llama-chat.h +52 -0
  151. package/src/llama.cpp/src/llama-context.cpp +1775 -0
  152. package/src/llama.cpp/src/llama-context.h +128 -0
  153. package/src/llama.cpp/src/llama-cparams.cpp +1 -0
  154. package/src/llama.cpp/src/llama-cparams.h +37 -0
  155. package/src/llama.cpp/src/llama-grammar.cpp +5 -4
  156. package/src/llama.cpp/src/llama-grammar.h +3 -1
  157. package/src/llama.cpp/src/llama-hparams.cpp +71 -0
  158. package/src/llama.cpp/src/llama-hparams.h +139 -0
  159. package/src/llama.cpp/src/llama-impl.cpp +167 -0
  160. package/src/llama.cpp/src/llama-impl.h +16 -136
  161. package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
  162. package/src/llama.cpp/src/llama-kv-cache.h +218 -0
  163. package/src/llama.cpp/src/llama-mmap.cpp +589 -0
  164. package/src/llama.cpp/src/llama-mmap.h +67 -0
  165. package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
  166. package/src/llama.cpp/src/llama-model-loader.h +167 -0
  167. package/src/llama.cpp/src/llama-model.cpp +3953 -0
  168. package/src/llama.cpp/src/llama-model.h +370 -0
  169. package/src/llama.cpp/src/llama-quant.cpp +934 -0
  170. package/src/llama.cpp/src/llama-quant.h +1 -0
  171. package/src/llama.cpp/src/llama-sampling.cpp +147 -32
  172. package/src/llama.cpp/src/llama-sampling.h +3 -19
  173. package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
  174. package/src/llama.cpp/src/llama-vocab.h +97 -142
  175. package/src/llama.cpp/src/llama.cpp +7160 -20314
  176. package/src/llama.cpp/src/unicode.cpp +8 -3
  177. package/src/llama.cpp/tests/CMakeLists.txt +2 -0
  178. package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
  179. package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
  180. package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
  181. package/src/llama.cpp/tests/test-gguf.cpp +222 -187
  182. package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
  183. package/src/llama.cpp/tests/test-sampling.cpp +0 -1
  184. package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
  185. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
  186. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
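The diff hunks reproduced below come from a single file, the `package/src/llama.cpp/src/llama-vocab.cpp` rewrite (entry 173 above, +1832 -575). They show the upstream llama.cpp vocab refactor this release pulls in: free helper functions and public vocab fields are replaced by `llama_vocab` accessor methods, tokenizer sessions take their tokenizer by reference, and the mutable vocabulary state moves into a private `llama_vocab::impl`.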
@@ -1,5 +1,8 @@
 #include "llama-vocab.h"
 
+#include "llama-impl.h"
+#include "llama-model-loader.h"
+
 #include "unicode.h"
 
 #include <algorithm>
@@ -9,29 +12,15 @@
 #include <cstdarg>
 #include <cstring>
 #include <forward_list>
+#include <map>
 #include <queue>
-#include <sstream>
+#include <set>
+#include <unordered_map>
 
 //
 // helpers
 //
 
-LLAMA_ATTRIBUTE_FORMAT(1, 2)
-static std::string format(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
 struct naive_trie {
     naive_trie() : has_value(false), value(0) {
     }
@@ -76,96 +65,14 @@ struct naive_trie {
 };
 
 //
-// impl
+// tokenizers
 //
 
 struct llm_tokenizer {
-   llm_tokenizer() {}
-   virtual ~llm_tokenizer() = default;
+    llm_tokenizer() {}
+    virtual ~llm_tokenizer() = default;
 };
 
-llama_vocab::~llama_vocab() {
-    delete tokenizer;
-}
-
-int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
-    GGML_ASSERT(token_left.find(' ') == std::string::npos);
-    GGML_ASSERT(token_left.find('\n') == std::string::npos);
-    GGML_ASSERT(token_right.find(' ') == std::string::npos);
-    GGML_ASSERT(token_right.find('\n') == std::string::npos);
-
-    auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
-    if (it == bpe_ranks.end()) {
-        return -1;
-    }
-
-    return it->second;
-}
-
-static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
-    return vocab.type;
-}
-
-static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
-}
-
-static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
-}
-
-static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
-}
-
-static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
-}
-
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
-    return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
-}
-
-static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
-    GGML_ASSERT(llama_is_byte_token(vocab, id));
-    const auto & token_data = vocab.id_to_token.at(id);
-    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM:
-        case LLAMA_VOCAB_TYPE_UGM: {
-            auto buf = token_data.text.substr(3, 2);
-            return strtol(buf.c_str(), NULL, 16);
-        }
-        case LLAMA_VOCAB_TYPE_BPE: {
-            GGML_ABORT("fatal error");
-            //return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
-        }
-        case LLAMA_VOCAB_TYPE_WPM: {
-            GGML_ABORT("fatal error");
-        }
-        default:
-            GGML_ABORT("fatal error");
-    }
-}
-
-static void llama_escape_whitespace(std::string & text) {
-    replace_all(text, " ", "\xe2\x96\x81");
-}
-
-static void llama_unescape_whitespace(std::string & word) {
-    replace_all(word, "\xe2\x96\x81", " ");
-}
-
 struct llm_symbol {
     using index = int;
     index prev;
@@ -197,14 +104,13 @@ struct llm_bigram_spm {
 };
 
 struct llm_tokenizer_spm : llm_tokenizer {
-    llm_tokenizer_spm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+    llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
 };
 
 struct llm_tokenizer_spm_session {
     llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
         // split string into utf8 chars
         int index = 0;
         size_t offs = 0;
@@ -263,13 +169,13 @@
     }
 
 private:
-    void resegment(llm_symbol & symbol, std::vector<llama_vocab::id> & output) {
+    void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
         auto text = std::string(symbol.text, symbol.n);
-        auto token = vocab.token_to_id.find(text);
+        auto token = vocab.text_to_token(text);
 
         // Do we need to support is_unused?
-        if (token != vocab.token_to_id.end()) {
-            output.push_back((*token).second);
+        if (token != LLAMA_TOKEN_NULL) {
+            output.push_back(token);
             return;
         }
 
@@ -279,8 +185,8 @@ private:
         // output any symbols that did not form tokens as bytes.
         output.reserve(output.size() + symbol.n);
         for (int j = 0; j < (int)symbol.n; ++j) {
-            llama_vocab::id token_id = llama_byte_to_token_impl(vocab, symbol.text[j]);
-            output.push_back(token_id);
+            llama_token id = vocab.byte_to_token(symbol.text[j]);
+            output.push_back(id);
         }
         return;
     }
@@ -294,17 +200,17 @@ private:
             return;
         }
         const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
-        auto token = vocab.token_to_id.find(text);
+        auto token = vocab.text_to_token(text);
 
-        if (token == vocab.token_to_id.end()) {
+        if (token == LLAMA_TOKEN_NULL) {
            return;
        }
 
-        if (static_cast<size_t>((*token).second) >= vocab.id_to_token.size()) {
+        if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
            return;
        }
 
-        const auto & tok_data = vocab.id_to_token[(*token).second];
+        const auto & tok_data = vocab.get_token_data(token);
 
        llm_bigram_spm bigram;
        bigram.left = left;
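The recurring change in these hunks is the lookup API: call sites stop handling `vocab.token_to_id` iterators and instead compare the result of `vocab.text_to_token(...)` against `LLAMA_TOKEN_NULL`. A minimal self-contained sketch of that pattern (toy names, not the real API):

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>

// Toy sketch of the sentinel-return lookup: the vocab hands back a null id
// instead of exposing unordered_map iterators, so call sites test one value
// rather than container internals.
using toy_token = std::int32_t;
constexpr toy_token TOY_TOKEN_NULL = -1;   // stand-in for LLAMA_TOKEN_NULL

struct toy_vocab {
    std::unordered_map<std::string, toy_token> token_to_id;

    toy_token text_to_token(const std::string & text) const {
        const auto it = token_to_id.find(text);
        return it == token_to_id.end() ? TOY_TOKEN_NULL : it->second;
    }
};

// usage: if (vocab.text_to_token(str) != TOY_TOKEN_NULL) { ... }
```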
@@ -367,9 +273,9 @@ struct llm_bigram_bpe {
 };
 
 struct llm_tokenizer_bpe : llm_tokenizer {
-    llm_tokenizer_bpe(const llama_vocab & vocab) : llm_tokenizer() {
-        GGML_ASSERT(vocab.type == LLAMA_VOCAB_TYPE_BPE);
-        switch (vocab.type_pre) {
+    llm_tokenizer_bpe(const llama_vocab & vocab) {
+        GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
+        switch (vocab.get_pre_type()) {
            case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
                regex_exprs = {
                    // original regex from tokenizer.json
@@ -396,6 +302,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "\\p{N}+",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
+                regex_exprs = {
+                    "\\p{N}{1,3}",
+                    "[一-龥぀-ゟ゠-ヿ]+",
+                    "[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
            case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
                regex_exprs = {
                    "[\r\n]",
@@ -495,39 +408,38 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 };
 
 struct llm_tokenizer_bpe_session {
-    llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
-        bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
+    llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
 
-    static void append(const llama_vocab::id token_id, std::vector<llama_vocab::id> & output) {
+    static void append(const llama_token token_id, std::vector<llama_token> & output) {
        output.push_back(token_id);
    }
 
-    bool append_bos(std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_bos) {
-            GGML_ASSERT(vocab.special_bos_id != -1);
-            output.push_back(vocab.special_bos_id);
+    bool append_bos(std::vector<llama_token> & output) const {
+        if (vocab.get_add_bos()) {
+            GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
+            output.push_back(vocab.token_bos());
            return true;
        }
        return false;
    }
 
-    bool append_eos(std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_eos) {
-            GGML_ASSERT(vocab.special_eos_id != -1);
-            output.push_back(vocab.special_eos_id);
+    bool append_eos(std::vector<llama_token> & output) const {
+        if (vocab.get_add_eos()) {
+            GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
+            output.push_back(vocab.token_eos());
            return true;
        }
        return false;
    }
 
-    void check_double_bos_eos(const std::vector<llama_vocab::id> & output) const {
-        if (vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+    void check_double_bos_eos(const std::vector<llama_token> & output) const {
+        if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
            LLAMA_LOG_WARN(
                "%s: Added a BOS token to the prompt as specified by the model but the prompt "
                "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
                "Are you sure this is what you want?\n", __FUNCTION__);
        }
-        if (vocab.tokenizer_add_eos && output.size() >= 2 && *(output.end()-2) == vocab.special_eos_id) {
+        if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
            LLAMA_LOG_WARN(
                "%s: Added a EOS token to the prompt as specified by the model but the prompt "
                "also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
@@ -535,9 +447,9 @@ struct llm_tokenizer_bpe_session {
        }
    }
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, bpe_tokenizer->regex_exprs);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
 
        symbols_final.clear();
 
@@ -548,7 +460,8 @@
            int index = 0;
            size_t offset = 0;
 
-            if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            //if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+            if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                offset = word.size();
            }
@@ -622,18 +535,18 @@ struct llm_tokenizer_bpe_session {
            }
 
            const std::string str = std::string(symbol.text, symbol.n);
-            const auto token = vocab.token_to_id.find(str);
+            const auto token = vocab.text_to_token(str);
 
-            if (token == vocab.token_to_id.end()) {
+            if (token == LLAMA_TOKEN_NULL) {
                for (auto j = str.begin(); j != str.end(); ++j) {
                    std::string byte_str(1, *j);
-                    auto token_multibyte = vocab.token_to_id.find(byte_str);
-                    if (token_multibyte != vocab.token_to_id.end()) {
-                        output.push_back(token_multibyte->second);
+                    auto token_multibyte = vocab.text_to_token(byte_str);
+                    if (token_multibyte != LLAMA_TOKEN_NULL) {
+                        output.push_back(token_multibyte);
                    }
                }
            } else {
-                output.push_back((*token).second);
+                output.push_back(token);
            }
        }
    }
@@ -667,7 +580,7 @@ private:
    }
 
    const llama_vocab & vocab;
-    const llm_tokenizer_bpe * bpe_tokenizer;
+    const llm_tokenizer_bpe & tokenizer;
 
    std::vector<llm_symbol> symbols;
    std::vector<llm_symbol> symbols_final;
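The hunk above is the core of the session refactor: instead of recovering the tokenizer via `static_cast` from a type-erased pointer stored on the vocab, each session is handed the immutable tokenizer by reference and keeps its own scratch buffers. A compact, self-contained sketch of that shape, with hypothetical toy names:

```cpp
#include <string>
#include <vector>

// The expensive, read-only state is built once; every tokenize call gets a
// cheap session that borrows it, so concurrent calls never share scratch.
struct toy_tokenizer {
    std::vector<std::string> regex_exprs;  // immutable after construction
};

struct toy_session {
    explicit toy_session(const toy_tokenizer & tokenizer) : tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<int> & output) {
        (void) tokenizer;          // a real session reads tokenizer.regex_exprs
        symbols.clear();           // per-session scratch, safe to reuse
        for (char c : text) {
            symbols.push_back(c);
        }
        output.push_back((int) symbols.size());
    }

private:
    const toy_tokenizer & tokenizer;  // shared, never mutated
    std::vector<char> symbols;        // never shared between sessions
};

int main() {
    toy_tokenizer tok;                // one per vocab
    toy_session s1(tok), s2(tok);     // e.g. one per thread
    std::vector<int> out;
    s1.tokenize("hello", out);
    s2.tokenize("world", out);
    return (int) out.size() - 2;      // 0 on success
}
```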
@@ -679,14 +592,13 @@ private:
 //
 
 struct llm_tokenizer_wpm : llm_tokenizer {
-    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) : llm_tokenizer() {}
+    llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
 };
 
 struct llm_tokenizer_wpm_session {
    llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
-        const auto & token_map = vocab.token_to_id;
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // normalize and split by whitespace
        std::vector<std::string> words = preprocess(text);
        // bos token prepended already
@@ -709,10 +621,10 @@ struct llm_tokenizer_wpm_session {
            for (int i = 0; i < n; ++i) {
                // loop through possible match length
                bool match = false;
-                for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
-                    auto it = token_map.find(word1.substr(i, j - i));
-                    if (it != token_map.end()) {
-                        output.push_back(it->second);
+                for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
+                    auto id = vocab.text_to_token(word1.substr(i, j - i));
+                    if (id != LLAMA_TOKEN_NULL) {
+                        output.push_back(id);
                        match = true;
                        i = j - 1;
                        break;
@@ -727,7 +639,7 @@ struct llm_tokenizer_wpm_session {
 
            // we didn't find any matches for this word
            if (current_tokens == output.size()) {
-                output.push_back(vocab.special_unk_id);
+                output.push_back(vocab.token_unk());
            }
        }
    }
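The WPM loop above is a greedy longest-match scan. A self-contained sketch of the same idea (toy types; `lookup()` stands in for `vocab.text_to_token()`, `-1` plays the role of `LLAMA_TOKEN_NULL`, and the per-word unknown handling is simplified):

```cpp
#include <algorithm>
#include <string>
#include <vector>

// At each position, try the longest candidate substring first (bounded by
// the longest token in the vocab), falling back to shorter ones; resume
// scanning right after a matched piece.
std::vector<int> greedy_longest_match(const std::string & word,
                                      int max_token_len,
                                      int (*lookup)(const std::string &)) {
    std::vector<int> out;
    const int n = (int) word.size();
    for (int i = 0; i < n; ++i) {
        bool match = false;
        for (int j = std::min(n, i + max_token_len + 1); j > i; j--) {
            const int id = lookup(word.substr(i, j - i));
            if (id != -1) {
                out.push_back(id);
                i = j - 1;        // continue after the matched piece
                match = true;
                break;
            }
        }
        if (!match) {
            out.push_back(0);     // toy unknown id; the real code emits one
            break;                // UNK per word that produced no matches
        }
    }
    return out;
}
```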
@@ -796,45 +708,45 @@ private:
 //
 
 struct llm_tokenizer_ugm : llm_tokenizer {
-    llm_tokenizer_ugm(const llama_vocab & vocab) : llm_tokenizer() {
-        if (vocab.precompiled_charsmap.size() > 0) {
+    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
+        if (precompiled_charsmap.size() > 0) {
            size_t charsmap_offset = 0;
 
            // First four bytes of precompiled_charsmap contains length of binary
            // blob containing XOR-compressed compact double array (XCDA) entries
-            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
            charsmap_offset += sizeof(xcda_blob_size);
-            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
 
            // Next xcda_blob_size bytes contain entries of XOR-compressed compact
            // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
-            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
            xcda_array_size = xcda_blob_size / sizeof(uint32_t);
            charsmap_offset += xcda_blob_size;
 
            // Remaining bytes of precompiled charsmap contain null-terminated
            // replacement strings for prefixes matched by the XCDA.
-            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
-            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+            prefix_replacements = &precompiled_charsmap[charsmap_offset];
+            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
        }
 
-        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
-            const auto &token_data = vocab.id_to_token[id];
+        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+            const auto & token_data = vocab.get_token_data(id);
 
-            if (llama_is_normal_token(vocab, id)) {
+            if (vocab.is_normal(id)) {
                min_score = std::min<float>(min_score, token_data.score);
                max_score = std::max<float>(max_score, token_data.score);
            }
 
-            if (llama_is_normal_token(vocab, id) ||
-                llama_is_user_defined_token(vocab, id) ||
-                llama_is_unused_token(vocab, id)) {
+            if (vocab.is_normal(id) ||
+                vocab.is_user_defined(id) ||
+                vocab.is_unused(id)) {
                token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
            }
 
-            if (llama_is_user_defined_token(vocab, id)) {
+            if (vocab.is_user_defined(id)) {
                user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
            }
        }
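The constructor above parses a small binary layout out of the precompiled charsmap blob. A self-contained sketch of that same layout (toy names, assuming little-endian data; `memcpy` is used instead of the pointer cast to avoid unaligned reads):

```cpp
#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// Layout: a 4-byte XCDA length, then that many bytes of XCDA entries, then
// null-terminated replacement strings addressed by offsets from the walk.
struct charsmap_view {
    const std::uint32_t * xcda = nullptr;
    std::size_t xcda_count = 0;
    const char * replacements = nullptr;
    std::size_t replacements_size = 0;
};

charsmap_view parse_charsmap(const std::vector<char> & blob) {
    std::uint32_t xcda_blob_size = 0;
    if (blob.size() < sizeof(xcda_blob_size)) {
        throw std::runtime_error("charsmap too small");
    }
    std::memcpy(&xcda_blob_size, blob.data(), sizeof(xcda_blob_size));
    if (xcda_blob_size + sizeof(xcda_blob_size) >= blob.size()) {
        throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
    }
    charsmap_view v;
    v.xcda = reinterpret_cast<const std::uint32_t *>(blob.data() + sizeof(xcda_blob_size));
    v.xcda_count = xcda_blob_size / sizeof(std::uint32_t);
    v.replacements = blob.data() + sizeof(xcda_blob_size) + xcda_blob_size;
    v.replacements_size = blob.size() - sizeof(xcda_blob_size) - xcda_blob_size;
    return v;
}
```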
@@ -863,8 +775,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
 };
 
 struct llm_tokenizer_ugm_session {
-    llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
-        ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
+    llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
 
    /* This implementation is based on SentencePiece optimized Viterbi algorithm for
     * unigram language models. The general idea is to:
@@ -879,7 +790,7 @@ struct llm_tokenizer_ugm_session {
     * After processing the whole sequence we backtrack from the end to get
     * the best tokenization.
    */
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        // get current size of output (for reversal later)
        size_t output_size = output.size();
 
@@ -892,9 +803,9 @@
        }
 
        // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.special_unk_id, 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
        // at the beginning tokenization score is zero
-        tokenization_results[0] = { vocab.special_unk_id, 0, 0 };
+        tokenization_results[0] = { vocab.token_unk(), 0, 0 };
 
        for (size_t input_offset = 0; input_offset < input_len;) {
            size_t prefix_offset = input_offset;
@@ -904,7 +815,7 @@
            // traverse the token matcher trie to find a matching token
            bool single_codepoint_token_found = false;
            const struct best_tokenization & current_best = tokenization_results[input_offset];
-            const struct naive_trie * node = ugm_tokenizer->token_matcher.traverse(normalized[prefix_offset++]);
+            const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
 
            while (prefix_offset <= input_len && node != NULL) {
                // check if we found valid token in prefix
@@ -914,13 +825,13 @@
                    single_codepoint_token_found = true;
                }
                llama_token token_id = node->value;
-                const auto & token_data = vocab.id_to_token[token_id];
+                const auto & token_data = vocab.get_token_data(token_id);
 
                // we set the user-defined token scores to 0 to make them more likely to be selected
                // (normal token scores are log probabilities, so they are negative)
                // score type is double here to make tokenization results exactly
                // the same as in the HF tokenizer using SentencePiece
-                const double token_score = llama_is_user_defined_token(vocab, token_id) ? 0.0 : token_data.score;
+                const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
                const double challenger_score = current_best.score_sum + token_score;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
@@ -934,11 +845,11 @@
            // if we didn't find a valid token corresponding to the whole UTF code point
            // then use unknown token as the tokenization of this UTF code point
            if (!single_codepoint_token_found) {
-                const double challenger_score = current_best.score_sum + ugm_tokenizer->unknown_token_score;
+                const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
                prefix_offset = input_offset + n_utf8_code_units;
                struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.special_unk_id, input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
                    current_champ = challenger;
                }
            }
@@ -951,7 +862,7 @@
        // merge sequences of consecutive unknown tokens into single unknown tokens
        bool is_prev_unknown = false;
        for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
-            bool is_unknown = tokenization.token_id == vocab.special_unk_id;
+            bool is_unknown = tokenization.token_id == vocab.token_unk();
            if (!(is_prev_unknown && is_unknown)) {
                output.push_back(tokenization.token_id);
            }
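The comment block above describes the algorithm in words; a compact self-contained sketch of the same dynamic program (toy types — `match`/`score` stand in for the trie walk over real token scores, and the unknown-token handling is omitted):

```cpp
#include <string>
#include <vector>

// dp[pos] holds the best-scoring tokenization of text[0..pos); every vocab
// piece matching at pos extends it, and the answer is read by backtracking.
struct best {
    int prev;       // index where the winning token starts; -1 = unreachable
    double score;   // cumulative log-probability
};

std::vector<std::string> viterbi_tokenize(const std::string & text,
                                          bool   (*match)(const std::string &),
                                          double (*score)(const std::string &)) {
    const int n = (int) text.size();
    std::vector<best> dp(n + 1, {-1, -1e30});
    dp[0] = {0, 0.0};                                  // empty prefix is reachable
    for (int i = 0; i < n; ++i) {
        if (dp[i].prev < 0) continue;                  // prefix unreachable
        for (int j = i + 1; j <= n; ++j) {
            const std::string piece = text.substr(i, j - i);
            if (!match(piece)) continue;
            const double s = dp[i].score + score(piece);
            if (s > dp[j].score) dp[j] = {i, s};
        }
    }
    std::vector<std::string> out;
    if (dp[n].prev < 0) return out;                    // no tokenization found
    for (int pos = n; pos > 0; pos = dp[pos].prev) {
        out.insert(out.begin(), text.substr(dp[pos].prev, pos - dp[pos].prev));
    }
    return out;
}
```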
@@ -978,11 +889,11 @@ private:
        normalized->clear();
        normalized->reserve(input.size() * 3);
 
-        const std::string space = vocab.tokenizer_escape_whitespaces ? ugm_tokenizer->escaped_space : " ";
+        const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
 
-        bool shall_prepend_space = !vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
-        bool shall_append_space = vocab.tokenizer_treat_whitespace_as_suffix && vocab.tokenizer_add_space_prefix;
-        bool shall_merge_spaces = vocab.tokenizer_remove_extra_whitespaces;
+        const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+        const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
+        const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
 
        bool is_space_prepended = false;
        bool processing_non_ws = false;
@@ -1074,7 +985,7 @@
 
        // if input prefix matches some user-defined token return this token as normalization result
        auto user_defined_token_match =
-            ugm_tokenizer->user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
+            tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
        if (user_defined_token_match.second > 0) {
            return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
        }
@@ -1082,8 +993,8 @@
        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;
 
-        if (ugm_tokenizer->xcda_array_size > 0) {
-            struct xcda_array_view xcda_view(ugm_tokenizer->xcda_array, ugm_tokenizer->xcda_array_size);
+        if (tokenizer.xcda_array_size > 0) {
+            struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
 
            // Find the longest normalized sequence matching the input prefix by walking
            // the XOR-compressed compact double array (XCDA) starting from the root node
@@ -1119,10 +1030,10 @@
 
        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
-            if (longest_prefix_offset >= ugm_tokenizer->prefix_replacements_size) {
+            if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
                throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
            }
-            const char * prefix_replacement = &(ugm_tokenizer->prefix_replacements)[longest_prefix_offset];
+            const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
            return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
        }
 
@@ -1139,7 +1050,7 @@
    }
 
    const llama_vocab & vocab;
-    const llm_tokenizer_ugm * ugm_tokenizer;
+    const llm_tokenizer_ugm & tokenizer;
 };
 
 //
@@ -1201,15 +1112,15 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
 }
 
 struct llm_tokenizer_rwkv : llm_tokenizer {
-    llm_tokenizer_rwkv(const llama_vocab & vocab) : llm_tokenizer() {
+    llm_tokenizer_rwkv(const llama_vocab & vocab) {
        // RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
        // For now, we decode the vocab here into the lookup we'll use for tokenization.
 
        // build trie
-        for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
-            const auto & token = vocab.id_to_token[id];
-            const auto data = llama_unescape_rwkv_token(token.text);
-            token_matcher.insert((const char *) data.data(), data.size(), id);
+        for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
+            const auto & data = vocab.get_token_data(id);
+            const auto text = llama_unescape_rwkv_token(data.text);
+            token_matcher.insert((const char *) text.data(), text.size(), id);
        }
    }
 
@@ -1217,16 +1128,15 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
 };
 
 struct llm_tokenizer_rwkv_session {
-    llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
-        rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
+    llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
 
-    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        uint32_t position = 0;
        while (position < text.size()) {
-            const struct naive_trie * node = rwkv_tokenizer.token_matcher.traverse(text[position]);
+            const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
            if (node == NULL) {
                // no matching token found, add unknown token
-                output.push_back(vocab.special_unk_id);
+                output.push_back(vocab.token_unk());
                position += 1;
                continue;
            }
@@ -1250,33 +1160,11 @@ struct llm_tokenizer_rwkv_session {
 
 private:
    const llama_vocab & vocab;
-    const llm_tokenizer_rwkv & rwkv_tokenizer;
+    const llm_tokenizer_rwkv & tokenizer;
 };
 
-void llama_vocab::init_tokenizer() {
-    switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM:
-            tokenizer = new llm_tokenizer_spm(*this);
-            break;
-        case LLAMA_VOCAB_TYPE_BPE:
-            tokenizer = new llm_tokenizer_bpe(*this);
-            break;
-        case LLAMA_VOCAB_TYPE_WPM:
-            tokenizer = new llm_tokenizer_wpm(*this);
-            break;
-        case LLAMA_VOCAB_TYPE_UGM:
-            tokenizer = new llm_tokenizer_ugm(*this);
-            break;
-        case LLAMA_VOCAB_TYPE_RWKV:
-            tokenizer = new llm_tokenizer_rwkv(*this);
-            break;
-        default:
-            GGML_ABORT("unsupported vocab type");
-    }
-}
-
 //
-// (de-) tokenize
+// impl
 //
 
 typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
@@ -1285,7 +1173,7 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
 } FRAGMENT_BUFFER_VARIANT_TYPE;
 
 struct fragment_buffer_variant {
-    fragment_buffer_variant(llama_vocab::id _token)
+    fragment_buffer_variant(llama_token _token)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
        token(_token),
@@ -1296,7 +1184,7 @@ struct fragment_buffer_variant {
    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
    :
        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id) - 1),
+        token((llama_token) - 1),
        raw_text(_raw_text),
        offset(_offset),
        length(_length){
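The rest of the file is dominated by one very large hunk: the free `tokenizer_st_partition` helper is deleted, and a new `llama_vocab::impl` takes ownership of every field that used to sit directly on `llama_vocab` — including the tokenizer, now held as a `std::unique_ptr` instead of a raw pointer freed in a hand-written destructor. A minimal sketch of the pimpl idiom it adopts, with toy names:

```cpp
#include <cstdint>
#include <memory>
#include <string>

// toy_vocab.h -- consumers see behavior only; the data layout stays private.
class toy_vocab {
public:
    toy_vocab();
    ~toy_vocab();                        // declared here, defined below
    std::uint32_t n_tokens() const;
private:
    struct impl;                         // opaque forward declaration
    std::unique_ptr<impl> pimpl;
};

// toy_vocab.cpp -- the only translation unit that sees the real state.
struct toy_vocab::impl {
    std::string tokenizer_name = "toy";
    std::uint32_t n = 0;
};

toy_vocab::toy_vocab() : pimpl(std::make_unique<impl>()) {}
toy_vocab::~toy_vocab() = default;       // unique_ptr needs the complete type here
std::uint32_t toy_vocab::n_tokens() const { return pimpl->n; }
```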
@@ -1306,451 +1194,1095 @@ struct fragment_buffer_variant {
    }
 
    const FRAGMENT_BUFFER_VARIANT_TYPE type;
-    const llama_vocab::id token;
+    const llama_token token;
    const std::string _dummy;
    const std::string & raw_text;
    const uint64_t offset;
    const uint64_t length;
 };
 
-// #define PRETOKENIZERDEBUG
+struct llama_vocab::impl {
+    uint32_t n_token_types = 0; // for BERT-style token types
 
-static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) {
-    // for each special token
-    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
-        const auto & data = vocab.id_to_token[special_id];
-        const auto & special_token = data.text;
+    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
-        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
-            // Ignore control and unknown tokens when parse_special == false
-            continue;
-            // User-defined tokens are still pre-tokenized before everything else
-            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
-            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
-        }
+    int max_token_len = 0; // used for optimizing longest token search
 
-        // for each text fragment
-        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
-        while (it != buffer.end()) {
-            auto & fragment = (*it);
+    // default LLaMA special tokens
+    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
+    llama_token special_bos_id = 1;
+    llama_token special_eos_id = 2;
+    llama_token special_eot_id = LLAMA_TOKEN_NULL;
+    llama_token special_eom_id = LLAMA_TOKEN_NULL;
+    llama_token special_unk_id = 0;
+    llama_token special_sep_id = LLAMA_TOKEN_NULL;
+    llama_token special_pad_id = LLAMA_TOKEN_NULL;
+    llama_token special_mask_id = LLAMA_TOKEN_NULL;
 
-            // if a fragment is text ( not yet processed )
-            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-                const auto & raw_text = fragment.raw_text;
+    llama_token linefeed_id = 13;
 
-                auto raw_text_base_offset = fragment.offset;
-                auto raw_text_base_length = fragment.length;
+    // fim tokens
+    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
+    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
 
-                // loop over the text
-                while (true) {
-                    // find the first occurrence of a given special token in this fragment
-                    // passing offset argument only limit the "search area" but match coordinates
-                    // are still relative to the source full raw_text
-                    auto match = raw_text.find(special_token, raw_text_base_offset);
+    // tokenizer flags
+    bool add_space_prefix = false;
+    bool add_bos = false;
+    bool add_eos = false;
+    bool ignore_merges = false;
+    bool clean_spaces = false; // clean_up_tokenization_spaces
+    bool remove_extra_whitespaces = false;
+    bool escape_whitespaces = true;
+    bool treat_whitespace_as_suffix = false;
 
-                    // no occurrences found, stop processing this fragment for a given special token
-                    if (match == std::string::npos) break;
+    std::unordered_map<std::string, llama_token> token_to_id;
+    std::vector<token_data> id_to_token;
 
-                    // check if match is within bounds of offset <-> length
-                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+    std::vector<llama_token> cache_special_tokens;
+    std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
 
-#ifdef PRETOKENIZERDEBUG
-                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    auto source = std::distance(buffer.begin(), it);
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
-                    // if match is further than base offset
-                    // then we have some text to the left of it
-                    if (match > raw_text_base_offset) {
-                        // left
-                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
-                        int64_t left_reminder_length = match - raw_text_base_offset;
+    // set of all tokens that cause "end of generation"
+    std::set<llama_token> special_eog_ids;
 
-                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
-                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
-                                left_reminder_length--;
-                            }
-                        }
+    std::unique_ptr<llm_tokenizer> tokenizer;
 
-                        if (left_reminder_length > 0) {
-                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
-                            it++;
-                        }
+    std::vector<char> precompiled_charsmap;
 
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
-#endif
-                    }
+    impl(const llama_vocab & vocab) : vocab(vocab) {
+    }
 
-                    // special token
-                    buffer.emplace_after(it, special_id);
-                    it++;
+    ~impl() = default;
 
-                    // right
-                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
-                        int64_t right_reminder_offset = match + special_token.length();
-                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+    void load(llama_model_loader & ml, const LLM_KV & kv);
 
-                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
-                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
-                                right_reminder_offset++;
-                                right_reminder_length--;
-                            }
-                        }
+    enum llama_vocab_type get_type() const;
 
-                        if (right_reminder_length > 0) {
-                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
-                            it++;
-                        }
+    std::string type_name() const;
 
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
-#endif
+    bool is_normal (llama_token id) const;
+    bool is_unknown (llama_token id) const;
+    bool is_control (llama_token id) const;
+    bool is_byte (llama_token id) const;
+    bool is_user_defined(llama_token id) const;
+    bool is_unused (llama_token id) const;
+    bool is_eog (llama_token id) const;
 
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
+    uint8_t token_to_byte(llama_token id) const;
 
-                        // repeat for the right side
-                        raw_text_base_offset = right_reminder_offset;
-                        raw_text_base_length = right_reminder_length;
+    llama_token_attr token_get_attr(llama_token id) const;
 
-#ifdef PRETOKENIZERDEBUG
-                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
-#endif
-                    } else {
-                        if (source == 0) {
-                            buffer.erase_after(buffer.before_begin());
-                        } else {
-                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
-                        }
-                        break;
-                    }
+    void init_tokenizer(enum llama_vocab_type type);
+
+    void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
+
+    std::string token_to_piece_for_cache(
+            llama_token token,
+            bool special) const;
+
+
+    std::vector<llama_token> tokenize(
+            const std::string & raw_text,
+            bool add_special,
+            bool parse_special = false) const;
+
+    int32_t tokenize(
+            const char * text,
+            int32_t text_len,
+            llama_token * tokens,
+            int32_t n_tokens_max,
+            bool add_special,
+            bool parse_special) const;
+
+    // does not write null-terminator to buf
+    int32_t token_to_piece(
+            llama_token token,
+            char * buf,
+            int32_t length,
+            int32_t lstrip,
+            bool special) const;
+
+    // use cached data
+    const std::string & token_to_piece(llama_token token) const;
+
+    int32_t detokenize(
+            const llama_token * tokens,
+            int32_t n_tokens,
+            char * text,
+            int32_t text_len_max,
+            bool remove_special,
+            bool unparse_special) const;
+
+    std::string detokenize(
+            const std::vector<llama_token> & tokens,
+            bool special) const;
+
+    void print_info() const;
+
+private:
+    const llama_vocab & vocab;
+};
+
+void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
+    struct gguf_context * ctx = ml.meta.get();
+
+    // determine vocab type
+    {
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+        ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
+
+        if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
+            type = LLAMA_VOCAB_TYPE_NONE;
+
+            // default special tokens
+            special_bos_id = LLAMA_TOKEN_NULL;
+            special_eos_id = LLAMA_TOKEN_NULL;
+            special_unk_id = LLAMA_TOKEN_NULL;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+            linefeed_id = LLAMA_TOKEN_NULL;
+
+            // read vocab size from metadata
+            uint32_t n_tokens = 0;
+            if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
+                LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
+                id_to_token.resize(n_tokens);
+            }
+
+            return;
+        }
+
+        if (tokenizer_model == "llama") {
+            type = LLAMA_VOCAB_TYPE_SPM;
+
+            // default special tokens
+            special_bos_id = 1;
+            special_eos_id = 2;
+            special_unk_id = 0;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "bert") {
+            type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            special_bos_id = 101;
+            special_eos_id = LLAMA_TOKEN_NULL;
+            special_unk_id = 100;
+            special_sep_id = 102;
+            special_pad_id = 0;
+            special_mask_id = 103;
+        } else if (tokenizer_model == "gpt2") {
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+
+            const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+            for (int i = 0; i < n_merges; i++) {
+                const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+                //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
+
+                std::string first;
+                std::string second;
+
+                const size_t pos = word.find(' ', 1);
+
+                if (pos != std::string::npos) {
+                    first = word.substr(0, pos);
+                    second = word.substr(pos + 1);
                }
+
+                bpe_ranks.emplace(std::make_pair(first, second), i);
            }
-            it++;
+
+            // default special tokens
+            special_bos_id = 11;
+            special_eos_id = 11;
+            special_unk_id = LLAMA_TOKEN_NULL;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+            special_mask_id = LLAMA_TOKEN_NULL;
+        } else if (tokenizer_model == "t5") {
+            type = LLAMA_VOCAB_TYPE_UGM;
+
+            // default special tokens
+            special_bos_id = LLAMA_TOKEN_NULL;
+            special_eos_id = 1;
+            special_unk_id = 2;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = 0;
+            special_mask_id = LLAMA_TOKEN_NULL;
+
+            const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
+            if (precompiled_charsmap_keyidx != -1) {
+                size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+                const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
+                precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+#ifdef IS_BIG_ENDIAN
+                // correct endiannes of data in precompiled_charsmap binary blob
+                uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+                *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
+                assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
+                size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
+                uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+                for (size_t i = 0; i < xcda_array_size; ++i) {
+                    xcda_array[i] = __builtin_bswap32(xcda_array[i]);
+                }
+#endif
+            }
+        } else if (tokenizer_model == "rwkv") {
+            type = LLAMA_VOCAB_TYPE_RWKV;
+
+            // default special tokens
+            special_bos_id = LLAMA_TOKEN_NULL;
+            special_eos_id = LLAMA_TOKEN_NULL;
+            special_unk_id = LLAMA_TOKEN_NULL;
+            special_sep_id = LLAMA_TOKEN_NULL;
+            special_pad_id = LLAMA_TOKEN_NULL;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
+        }
+
+        // for now, only BPE models have pre-tokenizers
+        if (type == LLAMA_VOCAB_TYPE_BPE) {
+            add_space_prefix = false;
+            clean_spaces = true;
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "default") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2" ||
+                    tokenizer_pre == "phi-2" ||
+                    tokenizer_pre == "jina-es" ||
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat" ||
+                    tokenizer_pre == "jina-v1-en" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                    tokenizer_pre == "command-r") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "qwen2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "stablelm2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
+                    tokenizer_pre == "olmo") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                    tokenizer_pre == "dbrx") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else if (
+                    tokenizer_pre == "smaug-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                    tokenizer_pre == "poro-chat") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "chatglm-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+                special_bos_id = LLAMA_TOKEN_NULL;
+            } else if (
+                    tokenizer_pre == "viking") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "jais") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                    tokenizer_pre == "tekken") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                clean_spaces = false;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                    tokenizer_pre == "smollm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "codeshell") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                    tokenizer_pre == "bloom") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
1573
+ tokenizer_pre == "gpt3-finnish") {
1574
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
1575
+ } else if (
1576
+ tokenizer_pre == "exaone") {
1577
+ pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
1578
+ } else if (
1579
+ tokenizer_pre == "chameleon") {
1580
+ pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
1581
+ add_bos = true;
1582
+ clean_spaces = false;
1583
+ } else if (
1584
+ tokenizer_pre == "minerva-7b") {
1585
+ pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
1586
+ } else if (
1587
+ tokenizer_pre == "megrez") {
1588
+ pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
1589
+ } else {
1590
+ throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
1591
+ }
1592
+ } else if (type == LLAMA_VOCAB_TYPE_SPM) {
1593
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1594
+ add_space_prefix = true;
1595
+ clean_spaces = false;
1596
+ add_bos = true;
1597
+ add_eos = false;
1598
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
1599
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1600
+ add_space_prefix = false;
1601
+ clean_spaces = true;
1602
+ add_bos = true;
1603
+ add_eos = false;
1604
+ } else if (type == LLAMA_VOCAB_TYPE_UGM) {
1605
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1606
+ add_bos = false;
1607
+ add_eos = true;
1608
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
1609
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1610
+ add_space_prefix = false;
1611
+ clean_spaces = false;
1612
+ add_bos = false;
1613
+ add_eos = false;
1614
+ } else {
1615
+ pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
1434
1616
  }
1617
+
1618
+ ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
1619
+ ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
1435
1620
  }
1436
- }
1437
1621
 
1438
- std::vector<llama_vocab::id> llama_tokenize_internal(
1439
- const llama_vocab & vocab,
1440
- std::string raw_text,
1441
- bool add_special,
1442
- bool parse_special) {
1443
- GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
1622
+ const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
1623
+ if (token_idx == -1) {
1624
+ throw std::runtime_error("cannot find tokenizer vocab in model file\n");
1625
+ }
1444
1626
 
1445
- std::vector<llama_vocab::id> output;
1446
- std::forward_list<fragment_buffer_variant> fragment_buffer;
1627
+ const float * scores = nullptr;
1628
+ const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
1629
+ if (score_idx != -1) {
1630
+ scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
1631
+ }
1447
1632
 
1448
- if (!raw_text.empty()) {
1449
- fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
1450
- tokenizer_st_partition(vocab, fragment_buffer, parse_special);
1633
+ const int * toktypes = nullptr;
1634
+ const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
1635
+ if (toktype_idx != -1) {
1636
+ toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
1451
1637
  }
1452
1638
 
1453
- switch (vocab.type) {
1454
- case LLAMA_VOCAB_TYPE_SPM:
1455
- {
1456
- // OG tokenizer behavior:
1457
- //
1458
- // tokenizer.encode('', add_special_tokens=True) returns [1]
1459
- // tokenizer.encode('', add_special_tokens=False) returns []
1639
+ uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
1640
+ id_to_token.resize(n_tokens);
1641
+
1642
+ for (uint32_t i = 0; i < n_tokens; i++) {
1643
+ std::string word = gguf_get_arr_str(ctx, token_idx, i);
1644
+ if (word.empty()) {
1645
+ LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
1646
+ word = "[EMPTY_" + std::to_string(i) + "]";
1647
+ }
1648
+
1649
+ token_to_id[word] = i;
1650
+ max_token_len = std::max(max_token_len, (int) word.size());
1651
+
1652
+ auto & token_data = id_to_token[i];
1653
+ token_data.text = std::move(word);
1654
+ token_data.score = scores ? scores[i] : 0.0f;
1655
+ token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
1656
+
1657
+ if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
1658
+ switch(toktypes[i]) {
1659
+ case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
1660
+ case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
1661
+ case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
1662
+ case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
1663
+ case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
1664
+ case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
1665
+ case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
1666
+ default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
1667
+ }
1668
+ }
1669
+ }
1670
+ GGML_ASSERT(id_to_token.size() == token_to_id.size());
1460
1671
 
1461
- bool is_prev_special = true; // prefix with space if first token
1672
+ init_tokenizer(type);
1462
1673
 
1463
- if (add_special && vocab.tokenizer_add_bos) {
1464
- GGML_ASSERT(vocab.special_bos_id != -1);
1465
- output.push_back(vocab.special_bos_id);
1466
- is_prev_special = true;
1467
- }
1674
+ // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
1675
+ if (type == LLAMA_VOCAB_TYPE_SPM) {
1676
+ try {
1677
+ linefeed_id = vocab.byte_to_token('\n');
1678
+ } catch (const std::exception & e) {
1679
+ LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
1680
+ linefeed_id = special_pad_id;
1681
+ }
1682
+ } else if (type == LLAMA_VOCAB_TYPE_WPM) {
1683
+ linefeed_id = special_pad_id;
1684
+ } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
1685
+ const std::vector<int> ids = tokenize("\n", false);
1686
+ GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
1687
+ linefeed_id = ids[0];
1688
+ } else {
1689
+ const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
1690
+
1691
+ //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
1692
+ if (ids.empty()) {
1693
+ LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
1694
+ linefeed_id = special_pad_id;
1695
+ } else {
1696
+ linefeed_id = ids[0];
1697
+ }
1698
+ }
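The "\xC4\x8A" literal above is the UTF-8 encoding of U+010A: byte-level (GPT-2 style) BPE vocabularies remap raw bytes onto printable code points, and for the newline byte this works out to 0x100 + 0x0A. A small illustration of the arithmetic (an assumption about the mapping convention, not code from this diff):

    // byte-level BPE maps the non-printable byte 0x0A ('\n') to code point
    // U+010A, whose UTF-8 encoding is the two bytes 0xC4 0x8A searched above
    const unsigned char byte = 0x0A;
    const unsigned int  cp   = 0x100 + byte;   // 0x10A
    const char utf8[] = "\xC4\x8A";            // UTF-8 of U+010A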
 
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ // special tokens
+ {
+ const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+ { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
+ { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
+ { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
+ { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
+ { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
+ { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
+ { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
+ { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
+ { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+ { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+ { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+ { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+ // deprecated
+ { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+ { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+ { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+ };
+
+ for (const auto & it : special_token_types) {
+ const std::string & key = kv(std::get<0>(it));
+ int32_t & id = std::get<1>(it);
+
+ uint32_t new_id;
+ if (!ml.get_key(std::get<0>(it), new_id, false)) {
+ continue;
+ }
+ if (new_id >= id_to_token.size()) {
+ LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+ __func__, key.c_str(), new_id, id);
+ } else {
+ id = new_id;
+ }
+ }
 
- // prefix with space if previous is special
- if (vocab.tokenizer_add_space_prefix && is_prev_special) {
- raw_text = " " + raw_text;
- }
+ // Handle add_bos and add_eos
+ {
+ bool temp = true;
 
- #ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
- #endif
- llama_escape_whitespace(raw_text);
- llm_tokenizer_spm_session session(vocab);
- session.tokenize(raw_text, output);
- is_prev_special = false;
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
- is_prev_special = true;
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+ add_bos = temp;
+ }
+ if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+ add_eos = temp;
+ }
+ }
+
+ // auto-detect special tokens by text
+ // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+ // for now, we apply this workaround to find the tokens based on their text
+
+ for (const auto & t : token_to_id) {
+ // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+ if (special_eot_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<EOT>"
+ || t.first == "<|end▁of▁sentence|>" // DeepSeek
+ ) {
+ special_eot_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
  }
+ }
 
- if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
+ // find EOM token: "<|eom_id|>"
+ if (special_eom_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|eom_id|>"
+ ) {
+ special_eom_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
+ }
 
- if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != -1);
- output.push_back(vocab.special_eos_id);
+ // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+ if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_prefix|>" // Qwen
+ || t.first == "<fim-prefix>"
+ || t.first == "<|fim▁begin|>" // DeepSeek
+ || t.first == "<PRE>"
+ ) {
+ special_fim_pre_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
- } break;
- case LLAMA_VOCAB_TYPE_BPE:
- {
- llm_tokenizer_bpe_session session(vocab);
- // it calls some other methods that are not exist in llm_tokenizer,
- // here just cast it to bpe tokenizer object
- if (add_special) {
- session.append_bos(output);
- }
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ }
 
- #ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
- #endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- session.append(fragment.token, output);
+ // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+ if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_suffix|>" // Qwen
+ || t.first == "<fim-suffix>"
+ || t.first == "<|fim▁hole|>" // DeepSeek
+ || t.first == "<SUF>"
+ ) {
+ special_fim_suf_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
  }
+ }
 
- if (add_special) {
- session.append_eos(output);
- session.check_double_bos_eos(output);
- }
- } break;
- case LLAMA_VOCAB_TYPE_WPM:
- {
- if (add_special) {
- GGML_ASSERT(vocab.special_cls_id != -1);
- output.push_back(vocab.special_cls_id);
+ // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+ if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_middle|>" // Qwen
+ || t.first == "<fim-middle>"
+ || t.first == "<|fim▁end|>" // DeepSeek
+ || t.first == "<MID>"
+ ) {
+ special_fim_mid_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
+ }
 
- llm_tokenizer_wpm_session session(vocab);
-
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
- #ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
- #endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
+ // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+ if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_pad|>" // Qwen
+ || t.first == "<fim-pad>"
+ || t.first == "<PAD>"
+ ) {
+ special_fim_pad_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
  }
+ }
 
- if (add_special) {
- GGML_ASSERT(vocab.special_sep_id != -1);
- output.push_back(vocab.special_sep_id);
- }
- } break;
- case LLAMA_VOCAB_TYPE_UGM:
- {
- if (add_special && vocab.tokenizer_add_bos) {
- GGML_ASSERT(vocab.special_bos_id != -1);
- output.push_back(vocab.special_bos_id);
+ // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+ if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|fim_repo|>" // Qwen
+ || t.first == "<|repo_name|>"
+ || t.first == "<fim-repo>"
+ || t.first == "<REPO>"
+ ) {
+ special_fim_rep_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+ }
  }
- llm_tokenizer_ugm_session session(vocab);
+ }
 
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
- #ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
- #endif
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
+ // find FIM_SEP token: "<|file_sep|>"
+ if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+ if (false
+ || t.first == "<|file_sep|>" // Qwen
+ ) {
+ special_fim_sep_id = t.second;
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
  }
+ }
+ }
 
- if (add_special && vocab.tokenizer_add_bos && output.size() >= 2 && output[1] == vocab.special_bos_id) {
- LLAMA_LOG_WARN(
- "%s: Added a BOS token to the prompt as specified by the model but the prompt "
- "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
- "Are you sure this is what you want?\n", __FUNCTION__);
- }
+ // maintain a list of tokens that cause end-of-generation
+ // this is currently determined based on the token text, which is obviously not ideal
+ // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+ special_eog_ids.clear();
 
- if (add_special && vocab.tokenizer_add_eos) {
- GGML_ASSERT(vocab.special_eos_id != -1);
- output.push_back(vocab.special_eos_id);
- }
- } break;
- case LLAMA_VOCAB_TYPE_RWKV:
- {
- llm_tokenizer_rwkv_session session(vocab);
- for (const auto & fragment : fragment_buffer) {
- if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
- auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+ special_eog_ids.insert(special_fim_pad_id);
+ }
 
- #ifdef PRETOKENIZERDEBUG
- LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
- #endif
+ if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+ special_eog_ids.insert(special_fim_rep_id);
+ }
 
- session.tokenize(raw_text, output);
- } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
- output.push_back(fragment.token);
- }
+ if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+ special_eog_ids.insert(special_fim_sep_id);
+ }
+
+ for (const auto & t : token_to_id) {
+ if (false
+ || t.first == "<|eot_id|>"
+ || t.first == "<|im_end|>"
+ || t.first == "<|end|>"
+ || t.first == "<end_of_turn>"
+ || t.first == "<|endoftext|>"
+ || t.first == "<|eom_id|>"
+ || t.first == "<EOT>"
+ ) {
+ special_eog_ids.insert(t.second);
+ if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+ LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+ __func__, t.second, t.first.c_str());
+ id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
  }
- } break;
- case LLAMA_VOCAB_TYPE_NONE:
- GGML_ABORT("fatal error");
+ } else {
+ // token is control, but not marked as EOG -> print a debug log
+ if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+ LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+ __func__, t.second, t.first.c_str());
+ }
+ }
+ }
+
+ // sanity checks
+ if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+ special_eog_ids.insert(special_eos_id);
+ LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+ special_eog_ids.insert(special_eot_id);
+ LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
+
+ if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+ special_eog_ids.insert(special_eom_id);
+ LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+ }
  }
 
- return output;
- }
+ // build special tokens cache
+ {
+ for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
+ if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+ cache_special_tokens.push_back(id);
+ }
+ }
 
- llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
- GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
- static const char * hex = "0123456789ABCDEF";
- switch (llama_vocab_get_type(vocab)) {
- case LLAMA_VOCAB_TYPE_SPM:
- case LLAMA_VOCAB_TYPE_UGM: {
- const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
- auto token = vocab.token_to_id.find(buf);
- if (token != vocab.token_to_id.end()) {
- return (*token).second;
+ std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+ [&] (const llama_token a, const llama_token b) {
+ return id_to_token[a].text.size() > id_to_token[b].text.size();
  }
- // Try to fall back to just the byte as a string
- const char buf2[2] = { (char)ch, 0 };
- return vocab.token_to_id.at(buf2);
+ );
+
+ LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+ }
+
+ // build token to piece cache
+ {
+ size_t size_cache = 0;
+
+ std::vector<std::string> cache(n_tokens);
+
+ for (uint32_t id = 0; id < n_tokens; ++id) {
+ cache[id] = token_to_piece_for_cache(id, true);
+
+ size_cache += cache[id].size();
  }
- case LLAMA_VOCAB_TYPE_WPM:
- case LLAMA_VOCAB_TYPE_BPE: {
- return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
+
+ std::swap(cache_token_to_piece, cache);
+
+ LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+ }
+
+ // Handle per token attributes
+ //NOTE: Each model customizes per token attributes.
+ //NOTE: Per token attributes are missing from the GGUF file.
+ //TODO: Extract attributes from GGUF file.
+ {
+ auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+ for (const auto & substr : substrs) {
+ if (str.find(substr) < std::string::npos) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
+ uint32_t current = id_to_token.at(id).attr;
+ current = value ? (current | attr) : (current & ~attr);
+ id_to_token[id].attr = (llama_token_attr) current;
+ };
+
+ auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+ _set_tokenid_attr(token_to_id.at(token), attr, value);
+ };
+
+ std::string model_name;
+ std::string tokenizer_pre;
+
+ ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+ ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+ // model name to lowercase
+ std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+ [] (const std::string::value_type x) {
+ return std::tolower(x);
+ }
+ );
+
+ // set attributes by model/tokenizer name
+ if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+ _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+ } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+ for (auto id : cache_special_tokens) {
+ _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"</s>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+ }
+ for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
+ _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+ }
  }
- default:
- GGML_ABORT("fatal error");
  }
  }
 
- const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].text.c_str();
+ enum llama_vocab_type llama_vocab::impl::get_type() const {
+ return type;
  }
 
- float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].score;
+ std::string llama_vocab::impl::type_name() const {
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+ case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+ case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+ case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+ case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+ case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+ default: return "unknown";
+ }
  }
 
- llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token) {
- GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
- return vocab.id_to_token[token].attr;
+ bool llama_vocab::impl::is_normal(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
  }
 
- bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
- return token != -1 && vocab.special_eog_ids.count(token) > 0;
+ bool llama_vocab::impl::is_unknown(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
  }
 
- bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token) {
- return llama_is_control_token(vocab, token);
+ bool llama_vocab::impl::is_control(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
  }
 
- llama_token llama_token_bos_impl(const struct llama_vocab & vocab) {
- return vocab.special_bos_id;
+ bool llama_vocab::impl::is_byte(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
  }
 
- llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
- return vocab.special_eos_id;
+ bool llama_vocab::impl::is_user_defined(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
  }
 
- llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
- return vocab.special_eot_id;
+ bool llama_vocab::impl::is_unused(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
  }
 
- llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
- return vocab.special_eom_id;
+ bool llama_vocab::impl::is_eog(llama_token id) const {
+ return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
  }
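A hedged usage sketch for the is_eog accessor above; sample_next and the vocab reference are hypothetical stand-ins, not APIs defined in this diff:

    // stop a generation loop on any end-of-generation token: EOS, EOT, EOM,
    // <|im_end|>, <end_of_turn>, ... all live in special_eog_ids
    for (;;) {
        const llama_token tok = sample_next();   // hypothetical sampler call
        if (vocab.is_eog(tok)) {
            break;                               // end of generation reached
        }
        // ... append tok to the output ...
    }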
 
- llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
- return vocab.special_cls_id;
+ uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
+ GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+ GGML_ASSERT(is_byte(id));
+ const auto & token_data = id_to_token.at(id);
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ case LLAMA_VOCAB_TYPE_UGM: {
+ auto buf = token_data.text.substr(3, 2);
+ return strtol(buf.c_str(), NULL, 16);
+ }
+ case LLAMA_VOCAB_TYPE_BPE: {
+ GGML_ABORT("fatal error");
+ }
+ case LLAMA_VOCAB_TYPE_WPM: {
+ GGML_ABORT("fatal error");
+ }
+ default:
+ GGML_ABORT("fatal error");
+ }
  }
 
- llama_token llama_token_sep_impl(const struct llama_vocab & vocab) {
- return vocab.special_sep_id;
+ llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
+ GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+ return id_to_token.at(id).attr;
  }
 
- llama_token llama_token_nl_impl(const struct llama_vocab & vocab) {
- return vocab.linefeed_id;
- }
+ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+ LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
 
- llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
- return vocab.special_pad_id;
+ switch (type) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
+ break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
+ break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
+ break;
+ default:
+ GGML_ABORT("unsupported vocab type");
+ }
  }
 
- bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
- return vocab.tokenizer_add_bos;
- }
+ //
+ // (de-) tokenize
+ //
 
- bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
- return vocab.tokenizer_add_eos;
- }
+ // #define PRETOKENIZERDEBUG
 
- llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pre_id;
- }
+ void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
+ // for each special token
+ for (const llama_token special_id : cache_special_tokens) {
+ const auto & data = vocab.get_token_data(special_id);
+ const auto & text = data.text;
 
- llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_mid_id;
- }
+ if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+ // Ignore control and unknown tokens when parse_special == false
+ continue;
+ // User-defined tokens are still pre-tokenized before everything else
+ // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+ // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+ }
 
- llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_suf_id;
- }
+ // for each text fragment
+ std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+ while (it != buffer.end()) {
+ auto & fragment = (*it);
 
- llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pre_id;
- }
+ // if a fragment is text ( not yet processed )
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ const auto & raw_text = fragment.raw_text;
 
- llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_suf_id;
- }
+ auto raw_text_base_offset = fragment.offset;
+ auto raw_text_base_length = fragment.length;
 
- llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_mid_id;
- }
+ // loop over the text
+ while (true) {
+ // find the first occurrence of a given special token in this fragment
+ // passing the offset argument only limits the "search area", but match coordinates
+ // are still relative to the source full raw_text
+ auto match = raw_text.find(text, raw_text_base_offset);
 
- llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_pad_id;
- }
+ // no occurrences found, stop processing this fragment for a given special token
+ if (match == std::string::npos) break;
 
- llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_rep_id;
- }
+ // check if match is within bounds of offset <-> length
+ if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
 
- llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
- return vocab.special_fim_sep_id;
- }
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+ #endif
+ auto source = std::distance(buffer.begin(), it);
 
- int32_t llama_tokenize_impl(
- const struct llama_vocab & vocab,
- const char * text,
- int32_t text_len,
- llama_token * tokens,
- int32_t n_tokens_max,
- bool add_special,
- bool parse_special) {
- auto res = llama_tokenize_internal(vocab, std::string(text, text_len), add_special, parse_special);
- if (n_tokens_max < (int) res.size()) {
- // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
- return -((int) res.size());
+ // if match is further than base offset
+ // then we have some text to the left of it
+ if (match > raw_text_base_offset) {
+ // left
+ const int64_t left_reminder_offset = raw_text_base_offset + 0;
+ int64_t left_reminder_length = match - raw_text_base_offset;
+
+ if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+ while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+ left_reminder_length--;
+ }
+ }
+
+ if (left_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+ it++;
+ }
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+ #endif
+ }
+
+ // special token
+ buffer.emplace_after(it, special_id);
+ it++;
+
+ // right
+ if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
+ int64_t right_reminder_offset = match + text.length();
+ int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
+
+ if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+ while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+ right_reminder_offset++;
+ right_reminder_length--;
+ }
+ }
+
+ if (right_reminder_length > 0) {
+ buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+ it++;
+ }
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+ #endif
+
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+
+ // repeat for the right side
+ raw_text_base_offset = right_reminder_offset;
+ raw_text_base_length = right_reminder_length;
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+ #endif
+ } else {
+ if (source == 0) {
+ buffer.erase_after(buffer.before_begin());
+ } else {
+ buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+ }
+ break;
+ }
+ }
+ }
+ it++;
+ }
  }
+ }
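tokenizer_st_partition above splits every raw-text fragment around each special token, visiting the longest special-token texts first because cache_special_tokens was sorted by descending length. A simplified, self-contained illustration of the same splitting idea (an assumption-level sketch, not the diff's code):

    #include <string>
    #include <vector>

    struct frag { bool is_token; std::string text; };

    // split `raw` around every occurrence of one special token's text
    static std::vector<frag> split_on_special(const std::string & raw, const std::string & special) {
        std::vector<frag> out;
        size_t pos = 0;
        for (;;) {
            const size_t match = raw.find(special, pos);
            if (match == std::string::npos) break;
            if (match > pos) out.push_back({false, raw.substr(pos, match - pos)});
            out.push_back({true, special});              // becomes a TOKEN fragment
            pos = match + special.size();
        }
        if (pos < raw.size()) out.push_back({false, raw.substr(pos)});
        return out;
    }

    // split_on_special("Hi<|im_end|>there", "<|im_end|>")
    // -> RAW "Hi", TOKEN "<|im_end|>", RAW "there"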
 
- for (size_t i = 0; i < res.size(); i++) {
- tokens[i] = res[i];
+ // NOTE: avoid ever using this except for building the token_to_piece caches
+ std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
+ std::string piece;
+ piece.resize(piece.capacity()); // using string internal cache
+ const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+ if (n_chars < 0) {
+ piece.resize(-n_chars);
+ int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+ GGML_ASSERT(check == -n_chars);
+ }
+ else {
+ piece.resize(n_chars);
  }
 
- return res.size();
+ return piece;
+ }
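token_to_piece_for_cache relies on the convention that token_to_piece returns the negative of the required length when the buffer is too small. The same probe/resize/retry idiom in isolation, with fill_piece as a hypothetical stand-in:

    std::string buf(8, '\0');
    int n = fill_piece(&buf[0], (int) buf.size());   // hypothetical size-probing call
    if (n < 0) {
        buf.resize(-n);                              // grow to the required size
        n = fill_piece(&buf[0], (int) buf.size());   // second call must now fit
    }
    buf.resize(n);                                   // trim to the bytes written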
+
+ static void llama_escape_whitespace(std::string & text) {
+ replace_all(text, " ", "\xe2\x96\x81");
+ }
+
+ static void llama_unescape_whitespace(std::string & word) {
+ replace_all(word, "\xe2\x96\x81", " ");
  }
 
  static std::string llama_decode_text(const std::string & text) {
@@ -1773,11 +2305,185 @@ static std::string llama_decode_text(const std::string & text) {
  return decoded_text;
  }
 
- // does not write null-terminator to buf
- int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) {
+ std::vector<llama_token> llama_vocab::impl::tokenize(
+ const std::string & raw_text,
+ bool add_special,
+ bool parse_special) const {
+ GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+ std::vector<llama_token> output;
+ std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+ if (!raw_text.empty()) {
+ fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+ tokenizer_st_partition(fragment_buffer, parse_special);
+ }
+
+ switch (get_type()) {
+ case LLAMA_VOCAB_TYPE_SPM:
+ {
+ // OG tokenizer behavior:
+ //
+ // tokenizer.encode('', add_special_tokens=True) returns [1]
+ // tokenizer.encode('', add_special_tokens=False) returns []
+
+ bool is_prev_special = true; // prefix with space if first token
+
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ is_prev_special = true;
+ }
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text;
+
+ // prefix with space if previous is special
+ if (add_space_prefix && is_prev_special) {
+ text = ' ';
+ }
+
+ text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+ llama_escape_whitespace(text);
+ llm_tokenizer_spm_session session(vocab);
+ session.tokenize(text, output);
+ is_prev_special = false;
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ is_prev_special = true;
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_BPE:
+ {
+ llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
+ // the session calls some methods that do not exist on llm_tokenizer,
+ // so the tokenizer is cast to the BPE tokenizer object here
+ if (add_special) {
+ session.append_bos(output);
+ }
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ session.append(fragment.token, output);
+ }
+ }
+
+ if (add_special) {
+ session.append_eos(output);
+ session.check_double_bos_eos(output);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_WPM:
+ {
+ if (add_special) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+
+ llm_tokenizer_wpm_session session(vocab);
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special) {
+ GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_sep_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_UGM:
+ {
+ if (add_special && add_bos) {
+ GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_bos_id);
+ }
+ llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
+
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+
+ if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+ LLAMA_LOG_WARN(
+ "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+ "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+ "Are you sure this is what you want?\n", __FUNCTION__);
+ }
+
+ if (add_special && add_eos) {
+ GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+ output.push_back(special_eos_id);
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_RWKV:
+ {
+ llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
+ for (const auto & fragment : fragment_buffer) {
+ if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+ std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+ #ifdef PRETOKENIZERDEBUG
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+ #endif
+
+ session.tokenize(text, output);
+ } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+ output.push_back(fragment.token);
+ }
+ }
+ } break;
+ case LLAMA_VOCAB_TYPE_NONE:
+ GGML_ABORT("fatal error");
+ }
+
+ return output;
+ }
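A usage sketch for the tokenize method above, assuming a public wrapper that forwards to the impl the same way the accessors later in this diff do (the surrounding setup is hypothetical, not shown here):

    // text -> tokens -> text, using the cached piece lookup
    const std::vector<llama_token> ids = vocab.tokenize("Hello world", /*add_special=*/true, /*parse_special=*/false);
    std::string round_trip;
    for (const llama_token id : ids) {
        round_trip += vocab.token_to_piece(id);   // cached single-token piece
    }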
2482
+
2483
+ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
1778
2484
  // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
1779
2485
  static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
1780
- const llama_token_attr attr = llama_token_get_attr_impl(vocab, token);
2486
+ const llama_token_attr attr = token_get_attr(token);
1781
2487
  if (!special && (attr & attr_special)) {
1782
2488
  return 0;
1783
2489
  }
@@ -1798,7 +2504,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1798
2504
 
1799
2505
  // if we have a cache - use it
1800
2506
  {
1801
- const auto & cache = vocab.cache_token_to_piece;
2507
+ const auto & cache = cache_token_to_piece;
1802
2508
 
1803
2509
  if (!cache.empty()) {
1804
2510
  const auto & result = cache.at(token);
@@ -1806,9 +2512,9 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1806
2512
  }
1807
2513
  }
1808
2514
 
1809
- if (0 <= token && token < (int32_t) vocab.id_to_token.size()) {
1810
- const std::string & token_text = vocab.id_to_token[token].text;
1811
- switch (llama_vocab_get_type(vocab)) {
2515
+ if (0 <= token && token < (int32_t) id_to_token.size()) {
2516
+ const std::string & token_text = id_to_token[token].text;
2517
+ switch (get_type()) {
1812
2518
  case LLAMA_VOCAB_TYPE_WPM:
1813
2519
  case LLAMA_VOCAB_TYPE_SPM:
1814
2520
  case LLAMA_VOCAB_TYPE_UGM: {
@@ -1823,7 +2529,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1823
2529
  return _try_copy(result.data(), result.size());
1824
2530
  }
1825
2531
  if (attr & LLAMA_TOKEN_ATTR_BYTE) {
1826
- char byte = (char) llama_token_to_byte(vocab, token);
2532
+ char byte = (char) token_to_byte(token);
1827
2533
  return _try_copy((char*) &byte, 1);
1828
2534
  }
1829
2535
  break;
@@ -1859,43 +2565,46 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
1859
2565
  return 0;
1860
2566
  }
1861
2567
 
1862
- int32_t llama_detokenize_impl(
1863
- const struct llama_vocab & vocab,
2568
+ const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
2569
+ return cache_token_to_piece.at(token);
2570
+ }
2571
+
2572
+ int32_t llama_vocab::impl::detokenize(
1864
2573
  const llama_token * tokens,
1865
2574
  int32_t n_tokens,
1866
2575
  char * text,
1867
2576
  int32_t text_len_max,
1868
2577
  bool remove_special,
1869
- bool unparse_special) {
1870
- if (vocab.type == LLAMA_VOCAB_TYPE_NONE) {
2578
+ bool unparse_special) const {
2579
+ if (type == LLAMA_VOCAB_TYPE_NONE) {
1871
2580
  return 0;
1872
2581
  }
1873
2582
 
1874
- GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
2583
+    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
     int32_t total = 0;
 
     // remove the leading space
-    bool remove_space = vocab.tokenizer_add_space_prefix;
+    bool remove_space = add_space_prefix;
 
-    if (remove_special && vocab.tokenizer_add_bos) {
-        if (n_tokens > 0 && tokens[0] == vocab.special_bos_id) {
+    if (remove_special && add_bos) {
+        if (n_tokens > 0 && tokens[0] == special_bos_id) {
             remove_space = false;
             n_tokens--;
             tokens++;
         }
     }
 
-    if (remove_special && vocab.tokenizer_add_eos) {
-        if (n_tokens > 0 && tokens[n_tokens-1] == vocab.special_eos_id) {
+    if (remove_special && add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
             n_tokens--;
         }
     }
 
     for (int32_t i = 0; i < n_tokens; ++i) {
         GGML_ASSERT(avail >= 0);
-        int32_t n_chars = llama_token_to_piece_impl(vocab, tokens[i], text, avail, remove_space, unparse_special);
+        int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
         remove_space = false;
         if (n_chars < 0) {
             avail = 0;
@@ -1911,7 +2620,7 @@ int32_t llama_detokenize_impl(
         return -total;
     }
 
-    if (vocab.tokenizer_clean_spaces) {
+    if (clean_spaces) {
         text -= total; // restart text
 
         // first pass: characters ?!.,  //TODO: where do these characters come from?
@@ -1972,13 +2681,321 @@ int32_t llama_detokenize_impl(
     return total <= text_len_max ? total : -total;
 }
 
-std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
+void llama_vocab::impl::print_info() const {
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
+    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
+    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
+
+    // special tokens
+    if (special_bos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
+    if (special_eos_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
+    if (special_eot_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
+    if (special_eom_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
+    if (special_unk_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
+    if (special_sep_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
+    if (special_pad_id  != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
+    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
+
+    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
+
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
+
+    for (const auto & id : special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
+    }
+
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() {
+}
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+    pimpl->load(ml, kv);
+}
+
+enum llama_vocab_type llama_vocab::get_type() const {
+    return pimpl->type;
+}
+
+enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
+    return pimpl->pre_type;
+}
+
+uint32_t llama_vocab::n_tokens() const {
+    return (uint32_t) pimpl->id_to_token.size();
+}
+
+uint32_t llama_vocab::n_token_types() const {
+    return (uint32_t) pimpl->n_token_types;
+}
+
+std::string llama_vocab::type_name() const{
+    return pimpl->type_name();
+}
+
+bool llama_vocab::is_normal(llama_token id) const {
+    return pimpl->is_normal(id);
+}
+
+bool llama_vocab::is_unknown(llama_token id) const {
+    return pimpl->is_unknown(id);
+}
+
+bool llama_vocab::is_control(llama_token id) const {
+    return pimpl->is_control(id);
+}
+
+bool llama_vocab::is_byte(llama_token id) const {
+    return pimpl->is_byte(id);
+}
+
+bool llama_vocab::is_user_defined(llama_token id) const {
+    return pimpl->is_user_defined(id);
+}
+
+bool llama_vocab::is_unused(llama_token id) const {
+    return pimpl->is_unused(id);
+}
+
+bool llama_vocab::is_eog(llama_token id) const {
+    return pimpl->is_eog(id);
+}
+
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+    return pimpl->token_to_byte(id);
+}
+
+llama_token llama_vocab::byte_to_token(uint8_t ch) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    static const char * hex = "0123456789ABCDEF";
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+            auto token = pimpl->token_to_id.find(buf);
+            if (token != pimpl->token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return pimpl->token_to_id.at(buf2);
+        }
+        case LLAMA_VOCAB_TYPE_WPM:
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
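
Side note: SPM and UGM vocabularies spell raw bytes as <0xNN> placeholder tokens, which is exactly what the buf array in byte_to_token() above constructs before the lookup. A minimal standalone sketch of that formatting (byte_fallback_token is a hypothetical helper name, not part of the API):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Mirrors the <0xNN> byte-token spelling used by SPM/UGM vocabularies.
    static std::string byte_fallback_token(uint8_t ch) {
        static const char * hex = "0123456789ABCDEF";
        const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
        return std::string(buf);
    }

    int main() {
        std::printf("%s\n", byte_fallback_token(0x41).c_str()); // prints "<0x41>"
        return 0;
    }
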
+
+llama_token llama_vocab::text_to_token(const std::string & text) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    auto it = pimpl->token_to_id.find(text);
+    if (it != pimpl->token_to_id.end()) {
+        return (*it).second;
+    }
+    return LLAMA_TOKEN_NULL;
+}
+
+const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id);
+}
+
+const char * llama_vocab::token_get_text(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).text.c_str();
+}
+
+float llama_vocab::token_get_score(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).score;
+}
+
+llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
+    return pimpl->token_get_attr(id);
+}
+
+llama_token llama_vocab::token_bos() const {
+    return pimpl->special_bos_id;
+}
+
+llama_token llama_vocab::token_eos() const {
+    return pimpl->special_eos_id;
+}
+
+llama_token llama_vocab::token_eot() const {
+    return pimpl->special_eot_id;
+}
+
+llama_token llama_vocab::token_eom() const {
+    return pimpl->special_eom_id;
+}
+
+llama_token llama_vocab::token_unk() const {
+    return pimpl->special_unk_id;
+}
+
+llama_token llama_vocab::token_sep() const {
+    return pimpl->special_sep_id;
+}
+
+llama_token llama_vocab::token_nl() const {
+    return pimpl->linefeed_id;
+}
+
+llama_token llama_vocab::token_pad() const {
+    return pimpl->special_pad_id;
+}
+
+llama_token llama_vocab::token_prefix() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_middle() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_suffix() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_pre() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_fim_suf() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_mid() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_fim_pad() const {
+    return pimpl->special_fim_pad_id;
+}
+
+llama_token llama_vocab::token_fim_rep() const {
+    return pimpl->special_fim_rep_id;
+}
+
+llama_token llama_vocab::token_fim_sep() const {
+    return pimpl->special_fim_sep_id;
+}
+
+bool llama_vocab::get_add_space_prefix() const {
+    return pimpl->add_space_prefix;
+}
+
+bool llama_vocab::get_add_bos() const {
+    return pimpl->add_bos;
+}
+
+bool llama_vocab::get_add_eos() const {
+    return pimpl->add_eos;
+}
+
+bool llama_vocab::get_ignore_merges() const {
+    return pimpl->ignore_merges;
+}
+
+bool llama_vocab::get_clean_spaces() const {
+    return pimpl->clean_spaces;
+}
+
+bool llama_vocab::get_remove_extra_whitespaces() const {
+    return pimpl->remove_extra_whitespaces;
+}
+
+bool llama_vocab::get_escape_whitespaces() const {
+    return pimpl->escape_whitespaces;
+}
+
+bool llama_vocab::get_treat_whitespace_as_suffix() const {
+    return pimpl->treat_whitespace_as_suffix;
+}
+
+int llama_vocab::max_token_len() const {
+    return pimpl->max_token_len;
+}
+
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+    GGML_ASSERT(token_left.find(' ') == std::string::npos);
+    GGML_ASSERT(token_left.find('\n') == std::string::npos);
+    GGML_ASSERT(token_right.find(' ') == std::string::npos);
+    GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
+    if (it == pimpl->bpe_ranks.end()) {
+        return -1;
+    }
+
+    return it->second;
+}
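
For intuition, find_bpe_rank() consults a merge table in which a lower rank means the pair merges earlier during BPE tokenization, and -1 means the pair never merges. A toy sketch with made-up merge data (the real table is loaded from the model's vocabulary metadata):

    #include <map>
    #include <string>
    #include <utility>

    int main() {
        // lower rank = higher merge priority; the entries here are illustrative only
        std::map<std::pair<std::string, std::string>, int> bpe_ranks = {
            {{"t", "h"},  0},
            {{"th", "e"}, 1},
        };
        auto it = bpe_ranks.find({"t", "h"});
        return it == bpe_ranks.end() ? -1 : it->second; // 0: "t"+"h" merges first
    }
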
+
+int32_t llama_vocab::tokenize(
+        const char * text,
+        int32_t text_len,
+        llama_token * tokens,
+        int32_t n_tokens_max,
+        bool add_special,
+        bool parse_special) const {
+    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (n_tokens_max < (int) res.size()) {
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
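
Worth noting: when n_tokens_max is too small, the overload above returns the negated token count instead of writing past the buffer. A minimal sketch of the size-then-fill pattern this enables, written against the public C wrapper llama_tokenize (declared later in this diff; model and vocab setup elided, so treat it as illustrative):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::vector<llama_token> tokenize_two_pass(const llama_vocab * vocab, const std::string & text) {
        // probe with an empty buffer; a negative result is -(required size)
        int32_t n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                                   nullptr, 0, /*add_special=*/true, /*parse_special=*/false);
        std::vector<llama_token> tokens(n < 0 ? (size_t) -n : (size_t) n);
        // second call fills the correctly sized buffer
        n = llama_tokenize(vocab, text.c_str(), (int32_t) text.size(),
                           tokens.data(), (int32_t) tokens.size(), true, false);
        tokens.resize(n < 0 ? 0 : (size_t) n);
        return tokens;
    }
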
+
+std::vector<llama_token> llama_vocab::tokenize(
+        const std::string & raw_text,
+        bool add_special,
+        bool parse_special) const {
+    return pimpl->tokenize(raw_text, add_special, parse_special);
+}
+
+const std::string & llama_vocab::token_to_piece(llama_token token) const {
+    return pimpl->token_to_piece(token);
+}
+
+int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+    return pimpl->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_vocab::detokenize(
+        const llama_token * tokens,
+        int32_t n_tokens,
+        char * text,
+        int32_t text_len_max,
+        bool remove_special,
+        bool unparse_special) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1987,3 +3004,243 @@ std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return text;
 }
+
+void llama_vocab::print_info() const {
+    pimpl->print_info();
+}
+
+//
+// interface implementation
+//
+
+int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
+    return vocab->n_tokens();
+}
+
+// deprecated
+int32_t llama_n_vocab(const struct llama_vocab * vocab) {
+    return llama_vocab_n_tokens(vocab);
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
+    return vocab->get_type();
+}
+
+const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_text(token);
+}
+
+float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_score(token);
+}
+
+enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_attr(token);
+}
+
+bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_eog(token);
+}
+
+bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_control(token);
+}
+
+llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
+    return vocab->token_eos();
+}
+
+llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
+    return vocab->token_eot();
+}
+
+// deprecated
+llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
+    return vocab->token_sep();
+}
+
+llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
+    return vocab->token_nl();
+}
+
+llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
+    return vocab->token_pad();
+}
+
+bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
+    return vocab->get_add_bos();
+}
+
+bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
+    return vocab->get_add_eos();
+}
+
+llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pre();
+}
+
+llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
+    return vocab->token_fim_suf();
+}
+
+llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
+    return vocab->token_fim_mid();
+}
+
+llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pad();
+}
+
+llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_rep();
+}
+
+llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_sep();
+}
+
+// deprecated
+const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_text(vocab, token);
+}
+
+// deprecated
+float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_score(vocab, token);
+}
+
+// deprecated
+enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_attr(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_eog(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_control(vocab, token);
+}
+
+// deprecated
+llama_token llama_token_bos(const struct llama_vocab * vocab) {
+    return llama_vocab_bos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eos(const struct llama_vocab * vocab) {
+    return llama_vocab_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eot(const struct llama_vocab * vocab) {
+    return llama_vocab_eot(vocab);
+}
+
+// deprecated
+llama_token llama_token_cls(const struct llama_vocab * vocab) {
+    //return llama_vocab_cls(vocab);
+    return llama_vocab_bos(vocab); // avoid deprecation warning
+}
+
+// deprecated
+llama_token llama_token_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_sep(vocab);
+}
+
+// deprecated
+llama_token llama_token_nl (const struct llama_vocab * vocab) {
+    return llama_vocab_nl(vocab);
+}
+
+// deprecated
+llama_token llama_token_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_pad(vocab);
+}
+
+// deprecated
+bool llama_add_bos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_bos(vocab);
+}
+
+// deprecated
+bool llama_add_eos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pre(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_suf(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_mid(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pad(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_rep(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_sep(vocab);
+}
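
The deprecated shims above exist so that older call sites keep compiling while forwarding to the renamed llama_vocab_* entry points. An illustrative sketch of the implied one-line migrations (the old spellings emit deprecation warnings by design):

    #include "llama.h"

    int32_t vocab_size(const llama_vocab * vocab) {
        // before: return llama_n_vocab(vocab);   // deprecated wrapper
        return llama_vocab_n_tokens(vocab);       // current entry point
    }

    llama_token bos(const llama_vocab * vocab) {
        // before: return llama_token_bos(vocab); // deprecated wrapper
        return llama_vocab_bos(vocab);            // current entry point
    }
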
+
+//
+// tokenization
+//
+
+int32_t llama_tokenize(
+        const struct llama_vocab * vocab,
+        const char * text,
+        int32_t text_len,
+        llama_token * tokens,
+        int32_t n_tokens_max,
+        bool add_special,
+        bool parse_special) {
+    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
+}
+
+int32_t llama_token_to_piece(
+        const struct llama_vocab * vocab,
+        llama_token token,
+        char * buf,
+        int32_t length,
+        int32_t lstrip,
+        bool special) {
+    return vocab->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_detokenize(
+        const struct llama_vocab * vocab,
+        const llama_token * tokens,
+        int32_t n_tokens,
+        char * text,
+        int32_t text_len_max,
+        bool remove_special,
+        bool unparse_special) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
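
Like llama_tokenize, llama_detokenize reports an undersized buffer by returning a negated character count. A sketch of the resize-and-retry loop, mirroring the std::string overload of llama_vocab::detokenize earlier in this diff (vocab setup elided, illustrative only):

    #include <string>
    #include <vector>
    #include "llama.h"

    std::string detokenize_tokens(const llama_vocab * vocab, const std::vector<llama_token> & tokens) {
        std::string text(tokens.size(), '\0'); // initial guess: one char per token
        int32_t n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                                     &text[0], (int32_t) text.size(),
                                     /*remove_special=*/false, /*unparse_special=*/false);
        if (n < 0) {
            text.resize(-n); // required size was reported as a negative count
            n = llama_detokenize(vocab, tokens.data(), (int32_t) tokens.size(),
                                 &text[0], (int32_t) text.size(), false, false);
        }
        text.resize(n < 0 ? 0 : n);
        return text;
    }
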