@fugood/llama.node 0.3.6 → 0.3.8
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
#include "llama-vocab.h"
|
|
2
2
|
|
|
3
|
+
#include "llama-impl.h"
|
|
4
|
+
#include "llama-model-loader.h"
|
|
5
|
+
|
|
3
6
|
#include "unicode.h"
|
|
4
7
|
|
|
5
8
|
#include <algorithm>
|
|
@@ -9,29 +12,15 @@
|
|
|
9
12
|
#include <cstdarg>
|
|
10
13
|
#include <cstring>
|
|
11
14
|
#include <forward_list>
|
|
15
|
+
#include <map>
|
|
12
16
|
#include <queue>
|
|
13
|
-
#include <
|
|
17
|
+
#include <set>
|
|
18
|
+
#include <unordered_map>
|
|
14
19
|
|
|
15
20
|
//
|
|
16
21
|
// helpers
|
|
17
22
|
//
|
|
18
23
|
|
|
19
|
-
LLAMA_ATTRIBUTE_FORMAT(1, 2)
|
|
20
|
-
static std::string format(const char * fmt, ...) {
|
|
21
|
-
va_list ap;
|
|
22
|
-
va_list ap2;
|
|
23
|
-
va_start(ap, fmt);
|
|
24
|
-
va_copy(ap2, ap);
|
|
25
|
-
int size = vsnprintf(NULL, 0, fmt, ap);
|
|
26
|
-
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
|
27
|
-
std::vector<char> buf(size + 1);
|
|
28
|
-
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
|
29
|
-
GGML_ASSERT(size2 == size);
|
|
30
|
-
va_end(ap2);
|
|
31
|
-
va_end(ap);
|
|
32
|
-
return std::string(buf.data(), size);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
24
|
struct naive_trie {
|
|
36
25
|
naive_trie() : has_value(false), value(0) {
|
|
37
26
|
}
|
|
@@ -76,96 +65,14 @@ struct naive_trie {
|
|
|
76
65
|
};
|
|
77
66
|
|
|
78
67
|
//
|
|
79
|
-
//
|
|
68
|
+
// tokenizers
|
|
80
69
|
//
|
|
81
70
|
|
|
82
71
|
struct llm_tokenizer {
|
|
83
|
-
|
|
84
|
-
|
|
72
|
+
llm_tokenizer() {}
|
|
73
|
+
virtual ~llm_tokenizer() = default;
|
|
85
74
|
};
|
|
86
75
|
|
|
87
|
-
llama_vocab::~llama_vocab() {
|
|
88
|
-
delete tokenizer;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
|
|
92
|
-
GGML_ASSERT(token_left.find(' ') == std::string::npos);
|
|
93
|
-
GGML_ASSERT(token_left.find('\n') == std::string::npos);
|
|
94
|
-
GGML_ASSERT(token_right.find(' ') == std::string::npos);
|
|
95
|
-
GGML_ASSERT(token_right.find('\n') == std::string::npos);
|
|
96
|
-
|
|
97
|
-
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
|
|
98
|
-
if (it == bpe_ranks.end()) {
|
|
99
|
-
return -1;
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
return it->second;
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
|
106
|
-
return vocab.type;
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
|
110
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
111
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
|
115
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
116
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
|
120
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
121
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
|
|
122
|
-
}
|
|
123
|
-
|
|
124
|
-
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
|
125
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
126
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
|
|
130
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
131
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
|
|
132
|
-
}
|
|
133
|
-
|
|
134
|
-
static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
|
|
135
|
-
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
|
136
|
-
return vocab.id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
|
|
140
|
-
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
|
|
141
|
-
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
|
142
|
-
const auto & token_data = vocab.id_to_token.at(id);
|
|
143
|
-
switch (llama_vocab_get_type(vocab)) {
|
|
144
|
-
case LLAMA_VOCAB_TYPE_SPM:
|
|
145
|
-
case LLAMA_VOCAB_TYPE_UGM: {
|
|
146
|
-
auto buf = token_data.text.substr(3, 2);
|
|
147
|
-
return strtol(buf.c_str(), NULL, 16);
|
|
148
|
-
}
|
|
149
|
-
case LLAMA_VOCAB_TYPE_BPE: {
|
|
150
|
-
GGML_ABORT("fatal error");
|
|
151
|
-
//return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
|
|
152
|
-
}
|
|
153
|
-
case LLAMA_VOCAB_TYPE_WPM: {
|
|
154
|
-
GGML_ABORT("fatal error");
|
|
155
|
-
}
|
|
156
|
-
default:
|
|
157
|
-
GGML_ABORT("fatal error");
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
|
|
161
|
-
static void llama_escape_whitespace(std::string & text) {
|
|
162
|
-
replace_all(text, " ", "\xe2\x96\x81");
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
static void llama_unescape_whitespace(std::string & word) {
|
|
166
|
-
replace_all(word, "\xe2\x96\x81", " ");
|
|
167
|
-
}
|
|
168
|
-
|
|
169
76
|
struct llm_symbol {
|
|
170
77
|
using index = int;
|
|
171
78
|
index prev;
|
|
@@ -197,14 +104,13 @@ struct llm_bigram_spm {
|
|
|
197
104
|
};
|
|
198
105
|
|
|
199
106
|
struct llm_tokenizer_spm : llm_tokenizer {
|
|
200
|
-
llm_tokenizer_spm(const llama_vocab & /*vocab*/)
|
|
107
|
+
llm_tokenizer_spm(const llama_vocab & /*vocab*/) {}
|
|
201
108
|
};
|
|
202
109
|
|
|
203
110
|
struct llm_tokenizer_spm_session {
|
|
204
111
|
llm_tokenizer_spm_session(const llama_vocab & vocab) : vocab(vocab) {}
|
|
205
112
|
|
|
206
|
-
void tokenize(const std::string & text, std::vector<
|
|
207
|
-
|
|
113
|
+
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
|
208
114
|
// split string into utf8 chars
|
|
209
115
|
int index = 0;
|
|
210
116
|
size_t offs = 0;
|
|
@@ -263,13 +169,13 @@ struct llm_tokenizer_spm_session {
|
|
|
263
169
|
}
|
|
264
170
|
|
|
265
171
|
private:
|
|
266
|
-
void resegment(llm_symbol & symbol, std::vector<
|
|
172
|
+
void resegment(llm_symbol & symbol, std::vector<llama_token> & output) {
|
|
267
173
|
auto text = std::string(symbol.text, symbol.n);
|
|
268
|
-
auto token = vocab.
|
|
174
|
+
auto token = vocab.text_to_token(text);
|
|
269
175
|
|
|
270
176
|
// Do we need to support is_unused?
|
|
271
|
-
if (token !=
|
|
272
|
-
output.push_back(
|
|
177
|
+
if (token != LLAMA_TOKEN_NULL) {
|
|
178
|
+
output.push_back(token);
|
|
273
179
|
return;
|
|
274
180
|
}
|
|
275
181
|
|
|
@@ -279,8 +185,8 @@ private:
|
|
|
279
185
|
// output any symbols that did not form tokens as bytes.
|
|
280
186
|
output.reserve(output.size() + symbol.n);
|
|
281
187
|
for (int j = 0; j < (int)symbol.n; ++j) {
|
|
282
|
-
|
|
283
|
-
output.push_back(
|
|
188
|
+
llama_token id = vocab.byte_to_token(symbol.text[j]);
|
|
189
|
+
output.push_back(id);
|
|
284
190
|
}
|
|
285
191
|
return;
|
|
286
192
|
}
|
|
@@ -294,17 +200,17 @@ private:
|
|
|
294
200
|
return;
|
|
295
201
|
}
|
|
296
202
|
const std::string text = std::string(symbols[left].text, symbols[left].n + symbols[right].n);
|
|
297
|
-
auto token = vocab.
|
|
203
|
+
auto token = vocab.text_to_token(text);
|
|
298
204
|
|
|
299
|
-
if (token ==
|
|
205
|
+
if (token == LLAMA_TOKEN_NULL) {
|
|
300
206
|
return;
|
|
301
207
|
}
|
|
302
208
|
|
|
303
|
-
if (static_cast<
|
|
209
|
+
if (static_cast<uint32_t>(token) >= vocab.n_tokens()) {
|
|
304
210
|
return;
|
|
305
211
|
}
|
|
306
212
|
|
|
307
|
-
const auto & tok_data = vocab.
|
|
213
|
+
const auto & tok_data = vocab.get_token_data(token);
|
|
308
214
|
|
|
309
215
|
llm_bigram_spm bigram;
|
|
310
216
|
bigram.left = left;
|
|
@@ -367,9 +273,9 @@ struct llm_bigram_bpe {
|
|
|
367
273
|
};
|
|
368
274
|
|
|
369
275
|
struct llm_tokenizer_bpe : llm_tokenizer {
|
|
370
|
-
llm_tokenizer_bpe(const llama_vocab & vocab)
|
|
371
|
-
GGML_ASSERT(vocab.
|
|
372
|
-
switch (vocab.
|
|
276
|
+
llm_tokenizer_bpe(const llama_vocab & vocab) {
|
|
277
|
+
GGML_ASSERT(vocab.get_type() == LLAMA_VOCAB_TYPE_BPE);
|
|
278
|
+
switch (vocab.get_pre_type()) {
|
|
373
279
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
|
374
280
|
regex_exprs = {
|
|
375
281
|
// original regex from tokenizer.json
|
|
@@ -396,6 +302,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
396
302
|
"\\p{N}+",
|
|
397
303
|
};
|
|
398
304
|
break;
|
|
305
|
+
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM:
|
|
306
|
+
regex_exprs = {
|
|
307
|
+
"\\p{N}{1,3}",
|
|
308
|
+
"[一-龥-ゟ゠-ヿ]+",
|
|
309
|
+
"[!\"#$%&'()*+,\\-./:;<=>?@\\[\\\\\\]^_`{|}~][A-Za-z]+|[^\r\n\\p{L}\\p{P}\\p{S}]?[\\p{L}\\p{M}]+| ?[\\p{P}\\p{S}]+[\r\n]*|\\s*[\r\n]+|\\s+(?!\\S)|\\s+",
|
|
310
|
+
};
|
|
311
|
+
break;
|
|
399
312
|
case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
|
|
400
313
|
regex_exprs = {
|
|
401
314
|
"[\r\n]",
|
|
@@ -495,39 +408,38 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
|
|
495
408
|
};
|
|
496
409
|
|
|
497
410
|
struct llm_tokenizer_bpe_session {
|
|
498
|
-
llm_tokenizer_bpe_session(const llama_vocab & vocab) : vocab(vocab),
|
|
499
|
-
bpe_tokenizer(static_cast<const llm_tokenizer_bpe *>(vocab.tokenizer)) {}
|
|
411
|
+
llm_tokenizer_bpe_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
|
|
500
412
|
|
|
501
|
-
static void append(const
|
|
413
|
+
static void append(const llama_token token_id, std::vector<llama_token> & output) {
|
|
502
414
|
output.push_back(token_id);
|
|
503
415
|
}
|
|
504
416
|
|
|
505
|
-
bool append_bos(std::vector<
|
|
506
|
-
if (vocab.
|
|
507
|
-
GGML_ASSERT(vocab.
|
|
508
|
-
output.push_back(vocab.
|
|
417
|
+
bool append_bos(std::vector<llama_token> & output) const {
|
|
418
|
+
if (vocab.get_add_bos()) {
|
|
419
|
+
GGML_ASSERT(vocab.token_bos() != LLAMA_TOKEN_NULL);
|
|
420
|
+
output.push_back(vocab.token_bos());
|
|
509
421
|
return true;
|
|
510
422
|
}
|
|
511
423
|
return false;
|
|
512
424
|
}
|
|
513
425
|
|
|
514
|
-
bool append_eos(std::vector<
|
|
515
|
-
if (vocab.
|
|
516
|
-
GGML_ASSERT(vocab.
|
|
517
|
-
output.push_back(vocab.
|
|
426
|
+
bool append_eos(std::vector<llama_token> & output) const {
|
|
427
|
+
if (vocab.get_add_eos()) {
|
|
428
|
+
GGML_ASSERT(vocab.token_eos() != LLAMA_TOKEN_NULL);
|
|
429
|
+
output.push_back(vocab.token_eos());
|
|
518
430
|
return true;
|
|
519
431
|
}
|
|
520
432
|
return false;
|
|
521
433
|
}
|
|
522
434
|
|
|
523
|
-
void check_double_bos_eos(const std::vector<
|
|
524
|
-
if (vocab.
|
|
435
|
+
void check_double_bos_eos(const std::vector<llama_token> & output) const {
|
|
436
|
+
if (vocab.get_add_bos() && output.size() >= 2 && output[1] == vocab.token_bos()) {
|
|
525
437
|
LLAMA_LOG_WARN(
|
|
526
438
|
"%s: Added a BOS token to the prompt as specified by the model but the prompt "
|
|
527
439
|
"also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
|
|
528
440
|
"Are you sure this is what you want?\n", __FUNCTION__);
|
|
529
441
|
}
|
|
530
|
-
if (vocab.
|
|
442
|
+
if (vocab.get_add_eos() && output.size() >= 2 && *(output.end()-2) == vocab.token_eos()) {
|
|
531
443
|
LLAMA_LOG_WARN(
|
|
532
444
|
"%s: Added a EOS token to the prompt as specified by the model but the prompt "
|
|
533
445
|
"also ends with a EOS token. So now the final prompt ends with 2 EOS tokens. "
|
|
@@ -535,9 +447,9 @@ struct llm_tokenizer_bpe_session {
|
|
|
535
447
|
}
|
|
536
448
|
}
|
|
537
449
|
|
|
538
|
-
void tokenize(const std::string & text, std::vector<
|
|
450
|
+
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
|
539
451
|
int final_prev_index = -1;
|
|
540
|
-
const auto word_collection = unicode_regex_split(text,
|
|
452
|
+
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
|
|
541
453
|
|
|
542
454
|
symbols_final.clear();
|
|
543
455
|
|
|
@@ -548,7 +460,8 @@ struct llm_tokenizer_bpe_session {
|
|
|
548
460
|
int index = 0;
|
|
549
461
|
size_t offset = 0;
|
|
550
462
|
|
|
551
|
-
if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
|
463
|
+
//if (vocab.tokenizer_ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
|
|
464
|
+
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
|
|
552
465
|
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
|
|
553
466
|
offset = word.size();
|
|
554
467
|
}
|
|
@@ -622,18 +535,18 @@ struct llm_tokenizer_bpe_session {
|
|
|
622
535
|
}
|
|
623
536
|
|
|
624
537
|
const std::string str = std::string(symbol.text, symbol.n);
|
|
625
|
-
const auto token = vocab.
|
|
538
|
+
const auto token = vocab.text_to_token(str);
|
|
626
539
|
|
|
627
|
-
if (token ==
|
|
540
|
+
if (token == LLAMA_TOKEN_NULL) {
|
|
628
541
|
for (auto j = str.begin(); j != str.end(); ++j) {
|
|
629
542
|
std::string byte_str(1, *j);
|
|
630
|
-
auto token_multibyte = vocab.
|
|
631
|
-
if (token_multibyte !=
|
|
632
|
-
output.push_back(token_multibyte
|
|
543
|
+
auto token_multibyte = vocab.text_to_token(byte_str);
|
|
544
|
+
if (token_multibyte != LLAMA_TOKEN_NULL) {
|
|
545
|
+
output.push_back(token_multibyte);
|
|
633
546
|
}
|
|
634
547
|
}
|
|
635
548
|
} else {
|
|
636
|
-
output.push_back(
|
|
549
|
+
output.push_back(token);
|
|
637
550
|
}
|
|
638
551
|
}
|
|
639
552
|
}
|
|
@@ -667,7 +580,7 @@ private:
|
|
|
667
580
|
}
|
|
668
581
|
|
|
669
582
|
const llama_vocab & vocab;
|
|
670
|
-
const llm_tokenizer_bpe
|
|
583
|
+
const llm_tokenizer_bpe & tokenizer;
|
|
671
584
|
|
|
672
585
|
std::vector<llm_symbol> symbols;
|
|
673
586
|
std::vector<llm_symbol> symbols_final;
|
|
@@ -679,14 +592,13 @@ private:
|
|
|
679
592
|
//
|
|
680
593
|
|
|
681
594
|
struct llm_tokenizer_wpm : llm_tokenizer {
|
|
682
|
-
llm_tokenizer_wpm(const llama_vocab & /*vocab*/)
|
|
595
|
+
llm_tokenizer_wpm(const llama_vocab & /*vocab*/) {}
|
|
683
596
|
};
|
|
684
597
|
|
|
685
598
|
struct llm_tokenizer_wpm_session {
|
|
686
599
|
llm_tokenizer_wpm_session(const llama_vocab & vocab) : vocab(vocab) {}
|
|
687
600
|
|
|
688
|
-
void tokenize(const std::string & text, std::vector<
|
|
689
|
-
const auto & token_map = vocab.token_to_id;
|
|
601
|
+
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
|
690
602
|
// normalize and split by whitespace
|
|
691
603
|
std::vector<std::string> words = preprocess(text);
|
|
692
604
|
// bos token prepended already
|
|
@@ -709,10 +621,10 @@ struct llm_tokenizer_wpm_session {
|
|
|
709
621
|
for (int i = 0; i < n; ++i) {
|
|
710
622
|
// loop through possible match length
|
|
711
623
|
bool match = false;
|
|
712
|
-
for (int j = std::min(n, i + vocab.max_token_len + 1); j > i; j--) {
|
|
713
|
-
auto
|
|
714
|
-
if (
|
|
715
|
-
output.push_back(
|
|
624
|
+
for (int j = std::min(n, i + vocab.max_token_len() + 1); j > i; j--) {
|
|
625
|
+
auto id = vocab.text_to_token(word1.substr(i, j - i));
|
|
626
|
+
if (id != LLAMA_TOKEN_NULL) {
|
|
627
|
+
output.push_back(id);
|
|
716
628
|
match = true;
|
|
717
629
|
i = j - 1;
|
|
718
630
|
break;
|
|
@@ -727,7 +639,7 @@ struct llm_tokenizer_wpm_session {
|
|
|
727
639
|
|
|
728
640
|
// we didn't find any matches for this word
|
|
729
641
|
if (current_tokens == output.size()) {
|
|
730
|
-
output.push_back(vocab.
|
|
642
|
+
output.push_back(vocab.token_unk());
|
|
731
643
|
}
|
|
732
644
|
}
|
|
733
645
|
}
|
|
@@ -796,45 +708,45 @@ private:
|
|
|
796
708
|
//
|
|
797
709
|
|
|
798
710
|
struct llm_tokenizer_ugm : llm_tokenizer {
|
|
799
|
-
llm_tokenizer_ugm(const llama_vocab & vocab
|
|
800
|
-
if (
|
|
711
|
+
llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
|
|
712
|
+
if (precompiled_charsmap.size() > 0) {
|
|
801
713
|
size_t charsmap_offset = 0;
|
|
802
714
|
|
|
803
715
|
// First four bytes of precompiled_charsmap contains length of binary
|
|
804
716
|
// blob containing XOR-compressed compact double array (XCDA) entries
|
|
805
|
-
uint32_t xcda_blob_size = *(const uint32_t *) &
|
|
717
|
+
uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
|
|
806
718
|
charsmap_offset += sizeof(xcda_blob_size);
|
|
807
|
-
if (xcda_blob_size + charsmap_offset >=
|
|
719
|
+
if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
|
|
808
720
|
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
|
|
809
721
|
}
|
|
810
722
|
|
|
811
723
|
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
|
|
812
724
|
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
|
|
813
|
-
xcda_array = (const uint32_t *) &
|
|
725
|
+
xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
|
|
814
726
|
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
|
|
815
727
|
charsmap_offset += xcda_blob_size;
|
|
816
728
|
|
|
817
729
|
// Remaining bytes of precompiled charsmap contain null-terminated
|
|
818
730
|
// replacement strings for prefixes matched by the XCDA.
|
|
819
|
-
prefix_replacements = &
|
|
820
|
-
prefix_replacements_size =
|
|
731
|
+
prefix_replacements = &precompiled_charsmap[charsmap_offset];
|
|
732
|
+
prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
|
|
821
733
|
}
|
|
822
734
|
|
|
823
|
-
for (
|
|
824
|
-
const auto &token_data = vocab.
|
|
735
|
+
for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
|
|
736
|
+
const auto & token_data = vocab.get_token_data(id);
|
|
825
737
|
|
|
826
|
-
if (
|
|
738
|
+
if (vocab.is_normal(id)) {
|
|
827
739
|
min_score = std::min<float>(min_score, token_data.score);
|
|
828
740
|
max_score = std::max<float>(max_score, token_data.score);
|
|
829
741
|
}
|
|
830
742
|
|
|
831
|
-
if (
|
|
832
|
-
|
|
833
|
-
|
|
743
|
+
if (vocab.is_normal(id) ||
|
|
744
|
+
vocab.is_user_defined(id) ||
|
|
745
|
+
vocab.is_unused(id)) {
|
|
834
746
|
token_matcher.insert(token_data.text.data(), token_data.text.size(), id);
|
|
835
747
|
}
|
|
836
748
|
|
|
837
|
-
if (
|
|
749
|
+
if (vocab.is_user_defined(id)) {
|
|
838
750
|
user_defined_token_matcher.insert(token_data.text.data(), token_data.text.size());
|
|
839
751
|
}
|
|
840
752
|
}
|
|
@@ -863,8 +775,7 @@ struct llm_tokenizer_ugm : llm_tokenizer {
|
|
|
863
775
|
};
|
|
864
776
|
|
|
865
777
|
struct llm_tokenizer_ugm_session {
|
|
866
|
-
llm_tokenizer_ugm_session(const llama_vocab & vocab) : vocab(vocab),
|
|
867
|
-
ugm_tokenizer(static_cast<const llm_tokenizer_ugm *>(vocab.tokenizer)) {}
|
|
778
|
+
llm_tokenizer_ugm_session(const llama_vocab & vocab, const llm_tokenizer_ugm & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
|
|
868
779
|
|
|
869
780
|
/* This implementation is based on SentencePiece optimized Viterbi algorithm for
|
|
870
781
|
* unigram language models. The general idea is to:
|
|
@@ -879,7 +790,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
879
790
|
* After processing the whole sequence we backtrack from the end to get
|
|
880
791
|
* the best tokenization.
|
|
881
792
|
*/
|
|
882
|
-
void tokenize(const std::string & text, std::vector<
|
|
793
|
+
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
|
883
794
|
// get current size of output (for reversal later)
|
|
884
795
|
size_t output_size = output.size();
|
|
885
796
|
|
|
@@ -892,9 +803,9 @@ struct llm_tokenizer_ugm_session {
|
|
|
892
803
|
}
|
|
893
804
|
|
|
894
805
|
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
|
|
895
|
-
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.
|
|
806
|
+
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
|
|
896
807
|
// at the beginning tokenization score is zero
|
|
897
|
-
tokenization_results[0] = { vocab.
|
|
808
|
+
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
|
898
809
|
|
|
899
810
|
for (size_t input_offset = 0; input_offset < input_len;) {
|
|
900
811
|
size_t prefix_offset = input_offset;
|
|
@@ -904,7 +815,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
904
815
|
// traverse the token matcher trie to find a matching token
|
|
905
816
|
bool single_codepoint_token_found = false;
|
|
906
817
|
const struct best_tokenization & current_best = tokenization_results[input_offset];
|
|
907
|
-
const struct naive_trie * node =
|
|
818
|
+
const struct naive_trie * node = tokenizer.token_matcher.traverse(normalized[prefix_offset++]);
|
|
908
819
|
|
|
909
820
|
while (prefix_offset <= input_len && node != NULL) {
|
|
910
821
|
// check if we found valid token in prefix
|
|
@@ -914,13 +825,13 @@ struct llm_tokenizer_ugm_session {
|
|
|
914
825
|
single_codepoint_token_found = true;
|
|
915
826
|
}
|
|
916
827
|
llama_token token_id = node->value;
|
|
917
|
-
const auto & token_data = vocab.
|
|
828
|
+
const auto & token_data = vocab.get_token_data(token_id);
|
|
918
829
|
|
|
919
830
|
// we set the user-defined token scores to 0 to make them more likely to be selected
|
|
920
831
|
// (normal token scores are log probabilities, so they are negative)
|
|
921
832
|
// score type is double here to make tokenization results exactly
|
|
922
833
|
// the same as in the HF tokenizer using SentencePiece
|
|
923
|
-
const double token_score =
|
|
834
|
+
const double token_score = vocab.is_user_defined(token_id) ? 0.0 : token_data.score;
|
|
924
835
|
const double challenger_score = current_best.score_sum + token_score;
|
|
925
836
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
|
926
837
|
if (challenger_score > current_champ.score_sum) {
|
|
@@ -934,11 +845,11 @@ struct llm_tokenizer_ugm_session {
|
|
|
934
845
|
// if we didn't find a valid token corresponding to the whole UTF code point
|
|
935
846
|
// then use unknown token as the tokenization of this UTF code point
|
|
936
847
|
if (!single_codepoint_token_found) {
|
|
937
|
-
const double challenger_score = current_best.score_sum +
|
|
848
|
+
const double challenger_score = current_best.score_sum + tokenizer.unknown_token_score;
|
|
938
849
|
prefix_offset = input_offset + n_utf8_code_units;
|
|
939
850
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
|
940
851
|
if (challenger_score > current_champ.score_sum) {
|
|
941
|
-
struct best_tokenization challenger = { vocab.
|
|
852
|
+
struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
|
|
942
853
|
current_champ = challenger;
|
|
943
854
|
}
|
|
944
855
|
}
|
|
@@ -951,7 +862,7 @@ struct llm_tokenizer_ugm_session {
|
|
|
951
862
|
// merge sequences of consecutive unknown tokens into single unknown tokens
|
|
952
863
|
bool is_prev_unknown = false;
|
|
953
864
|
for (struct best_tokenization & tokenization = tokenization_results[input_len]; ; tokenization = tokenization_results[tokenization.input_offset]) {
|
|
954
|
-
bool is_unknown = tokenization.token_id == vocab.
|
|
865
|
+
bool is_unknown = tokenization.token_id == vocab.token_unk();
|
|
955
866
|
if (!(is_prev_unknown && is_unknown)) {
|
|
956
867
|
output.push_back(tokenization.token_id);
|
|
957
868
|
}
|
|
@@ -978,11 +889,11 @@ private:
|
|
|
978
889
|
normalized->clear();
|
|
979
890
|
normalized->reserve(input.size() * 3);
|
|
980
891
|
|
|
981
|
-
const std::string space = vocab.
|
|
892
|
+
const std::string space = vocab.get_escape_whitespaces() ? tokenizer.escaped_space : " ";
|
|
982
893
|
|
|
983
|
-
bool shall_prepend_space = !vocab.
|
|
984
|
-
bool shall_append_space
|
|
985
|
-
bool shall_merge_spaces
|
|
894
|
+
const bool shall_prepend_space = !vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
|
|
895
|
+
const bool shall_append_space = vocab.get_treat_whitespace_as_suffix() && vocab.get_add_space_prefix();
|
|
896
|
+
const bool shall_merge_spaces = vocab.get_remove_extra_whitespaces();
|
|
986
897
|
|
|
987
898
|
bool is_space_prepended = false;
|
|
988
899
|
bool processing_non_ws = false;
|
|
@@ -1074,7 +985,7 @@ private:
|
|
|
1074
985
|
|
|
1075
986
|
// if input prefix matches some user-defined token return this token as normalization result
|
|
1076
987
|
auto user_defined_token_match =
|
|
1077
|
-
|
|
988
|
+
tokenizer.user_defined_token_matcher.get_longest_prefix(&input[input_offset], input.size() - input_offset);
|
|
1078
989
|
if (user_defined_token_match.second > 0) {
|
|
1079
990
|
return { &input[input_offset], user_defined_token_match.second, user_defined_token_match.second };
|
|
1080
991
|
}
|
|
@@ -1082,8 +993,8 @@ private:
|
|
|
1082
993
|
size_t longest_prefix_length = 0;
|
|
1083
994
|
size_t longest_prefix_offset = 0;
|
|
1084
995
|
|
|
1085
|
-
if (
|
|
1086
|
-
struct xcda_array_view xcda_view(
|
|
996
|
+
if (tokenizer.xcda_array_size > 0) {
|
|
997
|
+
struct xcda_array_view xcda_view(tokenizer.xcda_array, tokenizer.xcda_array_size);
|
|
1087
998
|
|
|
1088
999
|
// Find the longest normalized sequence matching the input prefix by walking
|
|
1089
1000
|
// the XOR-compressed compact double array (XCDA) starting from the root node
|
|
@@ -1119,10 +1030,10 @@ private:
|
|
|
1119
1030
|
|
|
1120
1031
|
if (longest_prefix_length > 0) {
|
|
1121
1032
|
// we have a match, so return the replacement sequence
|
|
1122
|
-
if (longest_prefix_offset >=
|
|
1033
|
+
if (longest_prefix_offset >= tokenizer.prefix_replacements_size) {
|
|
1123
1034
|
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
|
|
1124
1035
|
}
|
|
1125
|
-
const char * prefix_replacement = &(
|
|
1036
|
+
const char * prefix_replacement = &(tokenizer.prefix_replacements)[longest_prefix_offset];
|
|
1126
1037
|
return { prefix_replacement, strlen(prefix_replacement), longest_prefix_length };
|
|
1127
1038
|
}
|
|
1128
1039
|
|
|
@@ -1139,7 +1050,7 @@ private:
|
|
|
1139
1050
|
}
|
|
1140
1051
|
|
|
1141
1052
|
const llama_vocab & vocab;
|
|
1142
|
-
const llm_tokenizer_ugm
|
|
1053
|
+
const llm_tokenizer_ugm & tokenizer;
|
|
1143
1054
|
};
|
|
1144
1055
|
|
|
1145
1056
|
//
|
|
@@ -1201,15 +1112,15 @@ static std::vector<uint8_t> llama_unescape_rwkv_token(const std::string & escape
|
|
|
1201
1112
|
}
|
|
1202
1113
|
|
|
1203
1114
|
struct llm_tokenizer_rwkv : llm_tokenizer {
|
|
1204
|
-
llm_tokenizer_rwkv(const llama_vocab & vocab)
|
|
1115
|
+
llm_tokenizer_rwkv(const llama_vocab & vocab) {
|
|
1205
1116
|
// RWKV supports arbitrary byte tokens, but the vocab struct only supports string tokens.
|
|
1206
1117
|
// For now, we decode the vocab here into the lookup we'll use for tokenization.
|
|
1207
1118
|
|
|
1208
1119
|
// build trie
|
|
1209
|
-
for (
|
|
1210
|
-
const auto &
|
|
1211
|
-
const auto
|
|
1212
|
-
token_matcher.insert((const char *)
|
|
1120
|
+
for (uint32_t id = 0; id < vocab.n_tokens(); ++id) {
|
|
1121
|
+
const auto & data = vocab.get_token_data(id);
|
|
1122
|
+
const auto text = llama_unescape_rwkv_token(data.text);
|
|
1123
|
+
token_matcher.insert((const char *) text.data(), text.size(), id);
|
|
1213
1124
|
}
|
|
1214
1125
|
}
|
|
1215
1126
|
|
|
@@ -1217,16 +1128,15 @@ struct llm_tokenizer_rwkv : llm_tokenizer {
|
|
|
1217
1128
|
};
|
|
1218
1129
|
|
|
1219
1130
|
struct llm_tokenizer_rwkv_session {
|
|
1220
|
-
llm_tokenizer_rwkv_session(const llama_vocab & vocab) : vocab(vocab),
|
|
1221
|
-
rwkv_tokenizer(static_cast<const llm_tokenizer_rwkv &>(*vocab.tokenizer)) {}
|
|
1131
|
+
llm_tokenizer_rwkv_session(const llama_vocab & vocab, const llm_tokenizer_rwkv & tokenizer) : vocab(vocab), tokenizer(tokenizer) {}
|
|
1222
1132
|
|
|
1223
|
-
void tokenize(const std::string & text, std::vector<
|
|
1133
|
+
void tokenize(const std::string & text, std::vector<llama_token> & output) {
|
|
1224
1134
|
uint32_t position = 0;
|
|
1225
1135
|
while (position < text.size()) {
|
|
1226
|
-
const struct naive_trie * node =
|
|
1136
|
+
const struct naive_trie * node = tokenizer.token_matcher.traverse(text[position]);
|
|
1227
1137
|
if (node == NULL) {
|
|
1228
1138
|
// no matching token found, add unknown token
|
|
1229
|
-
output.push_back(vocab.
|
|
1139
|
+
output.push_back(vocab.token_unk());
|
|
1230
1140
|
position += 1;
|
|
1231
1141
|
continue;
|
|
1232
1142
|
}
|
|
@@ -1250,33 +1160,11 @@ struct llm_tokenizer_rwkv_session {
|
|
|
1250
1160
|
|
|
1251
1161
|
private:
|
|
1252
1162
|
const llama_vocab & vocab;
|
|
1253
|
-
const llm_tokenizer_rwkv &
|
|
1163
|
+
const llm_tokenizer_rwkv & tokenizer;
|
|
1254
1164
|
};
|
|
1255
1165
|
|
|
1256
|
-
void llama_vocab::init_tokenizer() {
|
|
1257
|
-
switch (type) {
|
|
1258
|
-
case LLAMA_VOCAB_TYPE_SPM:
|
|
1259
|
-
tokenizer = new llm_tokenizer_spm(*this);
|
|
1260
|
-
break;
|
|
1261
|
-
case LLAMA_VOCAB_TYPE_BPE:
|
|
1262
|
-
tokenizer = new llm_tokenizer_bpe(*this);
|
|
1263
|
-
break;
|
|
1264
|
-
case LLAMA_VOCAB_TYPE_WPM:
|
|
1265
|
-
tokenizer = new llm_tokenizer_wpm(*this);
|
|
1266
|
-
break;
|
|
1267
|
-
case LLAMA_VOCAB_TYPE_UGM:
|
|
1268
|
-
tokenizer = new llm_tokenizer_ugm(*this);
|
|
1269
|
-
break;
|
|
1270
|
-
case LLAMA_VOCAB_TYPE_RWKV:
|
|
1271
|
-
tokenizer = new llm_tokenizer_rwkv(*this);
|
|
1272
|
-
break;
|
|
1273
|
-
default:
|
|
1274
|
-
GGML_ABORT("unsupported vocab type");
|
|
1275
|
-
}
|
|
1276
|
-
}
|
|
1277
|
-
|
|
1278
1166
|
//
|
|
1279
|
-
//
|
|
1167
|
+
// impl
|
|
1280
1168
|
//
|
|
1281
1169
|
|
|
1282
1170
|
typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
|
|
@@ -1285,7 +1173,7 @@ typedef enum FRAGMENT_BUFFER_VARIANT_TYPE {
|
|
|
1285
1173
|
} FRAGMENT_BUFFER_VARIANT_TYPE;
|
|
1286
1174
|
|
|
1287
1175
|
struct fragment_buffer_variant {
|
|
1288
|
-
fragment_buffer_variant(
|
|
1176
|
+
fragment_buffer_variant(llama_token _token)
|
|
1289
1177
|
:
|
|
1290
1178
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
|
|
1291
1179
|
token(_token),
|
|
@@ -1296,7 +1184,7 @@ struct fragment_buffer_variant {
|
|
|
1296
1184
|
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
|
1297
1185
|
:
|
|
1298
1186
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
|
1299
|
-
token((
|
|
1187
|
+
token((llama_token) - 1),
|
|
1300
1188
|
raw_text(_raw_text),
|
|
1301
1189
|
offset(_offset),
|
|
1302
1190
|
length(_length){
|
|
@@ -1306,451 +1194,1095 @@ struct fragment_buffer_variant {
|
|
|
1306
1194
|
}
|
|
1307
1195
|
|
|
1308
1196
|
const FRAGMENT_BUFFER_VARIANT_TYPE type;
|
|
1309
|
-
const
|
|
1197
|
+
const llama_token token;
|
|
1310
1198
|
const std::string _dummy;
|
|
1311
1199
|
const std::string & raw_text;
|
|
1312
1200
|
const uint64_t offset;
|
|
1313
1201
|
const uint64_t length;
|
|
1314
1202
|
};
|
|
1315
1203
|
|
|
1316
|
-
|
|
1204
|
+
struct llama_vocab::impl {
|
|
1205
|
+
uint32_t n_token_types = 0; // for BERT-style token types
|
|
1317
1206
|
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
|
|
1321
|
-
const auto & data = vocab.id_to_token[special_id];
|
|
1322
|
-
const auto & special_token = data.text;
|
|
1207
|
+
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
|
|
1208
|
+
enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
|
|
1323
1209
|
|
|
1324
|
-
|
|
1325
|
-
// Ignore control and unknown tokens when parse_special == false
|
|
1326
|
-
continue;
|
|
1327
|
-
// User-defined tokens are still pre-tokenized before everything else
|
|
1328
|
-
// ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
|
|
1329
|
-
// This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
|
|
1330
|
-
}
|
|
1210
|
+
int max_token_len = 0; // used for optimizing longest token search
|
|
1331
1211
|
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1212
|
+
// default LLaMA special tokens
|
|
1213
|
+
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
|
|
1214
|
+
llama_token special_bos_id = 1;
|
|
1215
|
+
llama_token special_eos_id = 2;
|
|
1216
|
+
llama_token special_eot_id = LLAMA_TOKEN_NULL;
|
|
1217
|
+
llama_token special_eom_id = LLAMA_TOKEN_NULL;
|
|
1218
|
+
llama_token special_unk_id = 0;
|
|
1219
|
+
llama_token special_sep_id = LLAMA_TOKEN_NULL;
|
|
1220
|
+
llama_token special_pad_id = LLAMA_TOKEN_NULL;
|
|
1221
|
+
llama_token special_mask_id = LLAMA_TOKEN_NULL;
|
|
1336
1222
|
|
|
1337
|
-
|
|
1338
|
-
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
|
|
1339
|
-
const auto & raw_text = fragment.raw_text;
|
|
1223
|
+
llama_token linefeed_id = 13;
|
|
1340
1224
|
|
|
1341
|
-
|
|
1342
|
-
|
|
1225
|
+
// fim tokens
|
|
1226
|
+
llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
|
|
1227
|
+
llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
|
|
1228
|
+
llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
|
|
1229
|
+
llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
|
|
1230
|
+
llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
|
|
1231
|
+
llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
|
|
1343
1232
|
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1233
|
+
// tokenizer flags
|
|
1234
|
+
bool add_space_prefix = false;
|
|
1235
|
+
bool add_bos = false;
|
|
1236
|
+
bool add_eos = false;
|
|
1237
|
+
bool ignore_merges = false;
|
|
1238
|
+
bool clean_spaces = false; // clean_up_tokenization_spaces
|
|
1239
|
+
bool remove_extra_whitespaces = false;
|
|
1240
|
+
bool escape_whitespaces = true;
|
|
1241
|
+
bool treat_whitespace_as_suffix = false;
|
|
1350
1242
|
|
|
1351
|
-
|
|
1352
|
-
|
|
1243
|
+
std::unordered_map<std::string, llama_token> token_to_id;
|
|
1244
|
+
std::vector<token_data> id_to_token;
|
|
1353
1245
|
|
|
1354
|
-
|
|
1355
|
-
|
|
1246
|
+
std::vector<llama_token> cache_special_tokens;
|
|
1247
|
+
std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);
|
|
1356
1248
|
|
|
1357
|
-
|
|
1358
|
-
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
|
1359
|
-
#endif
|
|
1360
|
-
auto source = std::distance(buffer.begin(), it);
|
|
1249
|
+
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
|
1361
1250
|
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
if (match > raw_text_base_offset) {
|
|
1365
|
-
// left
|
|
1366
|
-
const int64_t left_reminder_offset = raw_text_base_offset + 0;
|
|
1367
|
-
int64_t left_reminder_length = match - raw_text_base_offset;
|
|
1251
|
+
// set of all tokens that cause "end of generation"
|
|
1252
|
+
std::set<llama_token> special_eog_ids;
|
|
1368
1253
|
|
|
1369
|
-
|
|
1370
|
-
while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
|
|
1371
|
-
left_reminder_length--;
|
|
1372
|
-
}
|
|
1373
|
-
}
|
|
1254
|
+
std::unique_ptr<llm_tokenizer> tokenizer;
|
|
1374
1255
|
|
|
1375
|
-
|
|
1376
|
-
buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
|
|
1377
|
-
it++;
|
|
1378
|
-
}
|
|
1256
|
+
std::vector<char> precompiled_charsmap;
|
|
1379
1257
|
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
#endif
|
|
1383
|
-
}
|
|
1258
|
+
impl(const llama_vocab & vocab) : vocab(vocab) {
|
|
1259
|
+
}
|
|
1384
1260
|
|
|
1385
|
-
|
|
1386
|
-
buffer.emplace_after(it, special_id);
|
|
1387
|
-
it++;
|
|
1261
|
+
~impl() = default;
|
|
1388
1262
|
|
|
1389
|
-
|
|
1390
|
-
if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
|
|
1391
|
-
int64_t right_reminder_offset = match + special_token.length();
|
|
1392
|
-
int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
|
|
1263
|
+
void load(llama_model_loader & ml, const LLM_KV & kv);
|
|
1393
1264
|
|
|
1394
|
-
|
|
1395
|
-
while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
|
|
1396
|
-
right_reminder_offset++;
|
|
1397
|
-
right_reminder_length--;
|
|
1398
|
-
}
|
|
1399
|
-
}
|
|
1265
|
+
enum llama_vocab_type get_type() const;
|
|
1400
1266
|
|
|
1401
|
-
|
|
1402
|
-
buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
|
|
1403
|
-
it++;
|
|
1404
|
-
}
|
|
1267
|
+
std::string type_name() const;
|
|
1405
1268
|
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1269
|
+
bool is_normal (llama_token id) const;
|
|
1270
|
+
bool is_unknown (llama_token id) const;
|
|
1271
|
+
bool is_control (llama_token id) const;
|
|
1272
|
+
bool is_byte (llama_token id) const;
|
|
1273
|
+
bool is_user_defined(llama_token id) const;
|
|
1274
|
+
bool is_unused (llama_token id) const;
|
|
1275
|
+
bool is_eog (llama_token id) const;
|
|
1409
1276
|
|
|
1410
|
-
|
|
1411
|
-
buffer.erase_after(buffer.before_begin());
|
|
1412
|
-
} else {
|
|
1413
|
-
buffer.erase_after(std::next(buffer.begin(), (source-1)));
|
|
1414
|
-
}
|
|
1277
|
+
uint8_t token_to_byte(llama_token id) const;
|
|
1415
1278
|
|
|
1416
|
-
|
|
1417
|
-
raw_text_base_offset = right_reminder_offset;
|
|
1418
|
-
raw_text_base_length = right_reminder_length;
|
|
1279
|
+
llama_token_attr token_get_attr(llama_token id) const;
|
|
1419
1280
|
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1281
|
+
void init_tokenizer(enum llama_vocab_type type);
|
|
1282
|
+
|
|
1283
|
+
void tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const;
|
|
1284
|
+
|
|
1285
|
+
std::string token_to_piece_for_cache(
|
|
1286
|
+
llama_token token,
|
|
1287
|
+
bool special) const;
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
std::vector<llama_token> tokenize(
|
|
1291
|
+
const std::string & raw_text,
|
|
1292
|
+
bool add_special,
|
|
1293
|
+
bool parse_special = false) const;
|
|
1294
|
+
|
|
1295
|
+
int32_t tokenize(
|
|
1296
|
+
const char * text,
|
|
1297
|
+
int32_t text_len,
|
|
1298
|
+
llama_token * tokens,
|
|
1299
|
+
int32_t n_tokens_max,
|
|
1300
|
+
bool add_special,
|
|
1301
|
+
bool parse_special) const;
|
|
1302
|
+
|
|
1303
|
+
// does not write null-terminator to buf
|
|
1304
|
+
int32_t token_to_piece(
|
|
1305
|
+
llama_token token,
|
|
1306
|
+
char * buf,
|
|
1307
|
+
int32_t length,
|
|
1308
|
+
int32_t lstrip,
|
|
1309
|
+
bool special) const;
|
|
1310
|
+
|
|
1311
|
+
// use cached data
|
|
1312
|
+
const std::string & token_to_piece(llama_token token) const;
|
|
1313
|
+
|
|
1314
|
+
int32_t detokenize(
|
|
1315
|
+
const llama_token * tokens,
|
|
1316
|
+
int32_t n_tokens,
|
|
1317
|
+
char * text,
|
|
1318
|
+
int32_t text_len_max,
|
|
1319
|
+
bool remove_special,
|
|
1320
|
+
bool unparse_special) const;
|
|
1321
|
+
|
|
1322
|
+
std::string detokenize(
|
|
1323
|
+
const std::vector<llama_token> & tokens,
|
|
1324
|
+
bool special) const;
|
|
1325
|
+
|
|
1326
|
+
void print_info() const;
|
|
1327
|
+
|
|
1328
|
+
private:
|
|
1329
|
+
const llama_vocab & vocab;
|
|
1330
|
+
};
|
|
1331
|
+
|
|
1332
|
+
void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|
1333
|
+
struct gguf_context * ctx = ml.meta.get();
|
|
1334
|
+
|
|
1335
|
+
// determine vocab type
|
|
1336
|
+
{
|
|
1337
|
+
std::string tokenizer_model;
|
|
1338
|
+
std::string tokenizer_pre;
|
|
1339
|
+
|
|
1340
|
+
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
|
|
1341
|
+
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
|
|
1342
|
+
|
|
1343
|
+
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, n_token_types, false);
|
|
1344
|
+
|
|
1345
|
+
if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
|
|
1346
|
+
type = LLAMA_VOCAB_TYPE_NONE;
|
|
1347
|
+
|
|
1348
|
+
// default special tokens
|
|
1349
|
+
special_bos_id = LLAMA_TOKEN_NULL;
|
|
1350
|
+
special_eos_id = LLAMA_TOKEN_NULL;
|
|
1351
|
+
special_unk_id = LLAMA_TOKEN_NULL;
|
|
1352
|
+
special_sep_id = LLAMA_TOKEN_NULL;
|
|
1353
|
+
special_pad_id = LLAMA_TOKEN_NULL;
|
|
1354
|
+
special_mask_id = LLAMA_TOKEN_NULL;
|
|
1355
|
+
linefeed_id = LLAMA_TOKEN_NULL;
|
|
1356
|
+
|
|
1357
|
+
// read vocab size from metadata
|
|
1358
|
+
uint32_t n_tokens = 0;
|
|
1359
|
+
if (ml.get_key(LLM_KV_VOCAB_SIZE, n_tokens, false)) {
|
|
1360
|
+
LLAMA_LOG_WARN("%s: adding %u dummy tokens\n", __func__, n_tokens);
|
|
1361
|
+
id_to_token.resize(n_tokens);
|
|
1362
|
+
}
|
|
1363
|
+
|
|
1364
|
+
return;
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
if (tokenizer_model == "llama") {
|
|
1368
|
+
type = LLAMA_VOCAB_TYPE_SPM;
|
|
1369
|
+
|
|
1370
|
+
// default special tokens
|
|
1371
|
+
special_bos_id = 1;
|
|
1372
|
+
special_eos_id = 2;
|
|
1373
|
+
special_unk_id = 0;
|
|
1374
|
+
special_sep_id = LLAMA_TOKEN_NULL;
|
|
1375
|
+
special_pad_id = LLAMA_TOKEN_NULL;
|
|
1376
|
+
special_mask_id = LLAMA_TOKEN_NULL;
|
|
1377
|
+
} else if (tokenizer_model == "bert") {
|
|
1378
|
+
type = LLAMA_VOCAB_TYPE_WPM;
|
|
1379
|
+
|
|
1380
|
+
// default special tokens
|
|
1381
|
+
special_bos_id = 101;
|
|
1382
|
+
special_eos_id = LLAMA_TOKEN_NULL;
|
|
1383
|
+
special_unk_id = 100;
|
|
1384
|
+
special_sep_id = 102;
|
|
1385
|
+
special_pad_id = 0;
|
|
1386
|
+
special_mask_id = 103;
|
|
1387
|
+
} else if (tokenizer_model == "gpt2") {
|
|
1388
|
+
type = LLAMA_VOCAB_TYPE_BPE;
|
|
1389
|
+
|
|
1390
|
+
// read bpe merges and populate bpe ranks
|
|
1391
|
+
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
|
|
1392
|
+
if (merges_keyidx == -1) {
|
|
1393
|
+
throw std::runtime_error("cannot find tokenizer merges in model file\n");
|
|
1394
|
+
}
|
|
1395
|
+
|
|
1396
|
+
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
|
|
1397
|
+
for (int i = 0; i < n_merges; i++) {
|
|
1398
|
+
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
|
|
1399
|
+
//GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
|
|
1400
|
+
|
|
1401
|
+
std::string first;
|
|
1402
|
+
std::string second;
|
|
1403
|
+
|
|
1404
|
+
const size_t pos = word.find(' ', 1);
|
|
1405
|
+
|
|
1406
|
+
if (pos != std::string::npos) {
|
|
1407
|
+
first = word.substr(0, pos);
|
|
1408
|
+
second = word.substr(pos + 1);
|
|
1431
1409
|
}
|
|
1410
|
+
|
|
1411
|
+
bpe_ranks.emplace(std::make_pair(first, second), i);
|
|
1432
1412
|
}
|
|
1433
|
-
|
|
1413
|
+
|
|
1414
|
+
// default special tokens
|
|
1415
|
+
special_bos_id = 11;
|
|
1416
|
+
special_eos_id = 11;
|
|
1417
|
+
special_unk_id = LLAMA_TOKEN_NULL;
|
|
1418
|
+
special_sep_id = LLAMA_TOKEN_NULL;
|
|
1419
|
+
special_pad_id = LLAMA_TOKEN_NULL;
|
|
1420
|
+
special_mask_id = LLAMA_TOKEN_NULL;
|
|
1421
|
+
} else if (tokenizer_model == "t5") {
|
|
1422
|
+
type = LLAMA_VOCAB_TYPE_UGM;
|
|
1423
|
+
|
|
1424
|
+
// default special tokens
|
|
1425
|
+
special_bos_id = LLAMA_TOKEN_NULL;
|
|
1426
|
+
special_eos_id = 1;
|
|
1427
|
+
special_unk_id = 2;
|
|
1428
|
+
special_sep_id = LLAMA_TOKEN_NULL;
|
|
1429
|
+
special_pad_id = 0;
|
|
1430
|
+
special_mask_id = LLAMA_TOKEN_NULL;
|
|
1431
|
+
|
|
1432
|
+
const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
|
|
1433
|
+
if (precompiled_charsmap_keyidx != -1) {
|
|
1434
|
+
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
|
1435
|
+
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
|
1436
|
+
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
|
1437
|
+
#ifdef IS_BIG_ENDIAN
|
|
1438
|
+
// correct endiannes of data in precompiled_charsmap binary blob
|
|
1439
|
+
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
|
|
1440
|
+
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
|
|
1441
|
+
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
|
|
1442
|
+
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
|
|
1443
|
+
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
|
|
1444
|
+
for (size_t i = 0; i < xcda_array_size; ++i) {
|
|
1445
|
+
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
|
|
1446
|
+
}
|
|
1447
|
+
#endif
|
|
1448
|
+
}
|
|
1449
|
+
} else if (tokenizer_model == "rwkv") {
|
|
1450
|
+
type = LLAMA_VOCAB_TYPE_RWKV;
|
|
1451
|
+
|
|
1452
|
+
// default special tokens
|
|
1453
|
+
special_bos_id = LLAMA_TOKEN_NULL;
|
|
1454
|
+
special_eos_id = LLAMA_TOKEN_NULL;
|
|
1455
|
+
special_unk_id = LLAMA_TOKEN_NULL;
|
|
1456
|
+
special_sep_id = LLAMA_TOKEN_NULL;
|
|
1457
|
+
special_pad_id = LLAMA_TOKEN_NULL;
|
|
1458
|
+
} else {
|
|
1459
|
+
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
|
|
1460
|
+
}
+
+        // for now, only BPE models have pre-tokenizers
+        if (type == LLAMA_VOCAB_TYPE_BPE) {
+            add_space_prefix = false;
+            clean_spaces = true;
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (tokenizer_pre == "default") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3" ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe"||
+                    tokenizer_pre == "falcon3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "deepseek-v3") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2" ||
+                    tokenizer_pre == "phi-2" ||
+                    tokenizer_pre == "jina-es" ||
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "gigachat" ||
+                    tokenizer_pre == "jina-v1-en" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de" ||
+                    tokenizer_pre == "jina-v2-code" ||
+                    tokenizer_pre == "roberta-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                    tokenizer_pre == "refact") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
+            } else if (
+                    tokenizer_pre == "command-r") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "qwen2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "stablelm2") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
+            } else if (
+                    tokenizer_pre == "olmo") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_OLMO;
+            } else if (
+                    tokenizer_pre == "dbrx") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_DBRX;
+            } else if (
+                    tokenizer_pre == "smaug-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMAUG;
+            } else if (
+                    tokenizer_pre == "poro-chat") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_PORO;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "chatglm-bpe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CHATGLM4;
+                special_bos_id = LLAMA_TOKEN_NULL;
+            } else if (
+                    tokenizer_pre == "viking") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_VIKING;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "jais") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS;
+            } else if (
+                    tokenizer_pre == "tekken") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_TEKKEN;
+                clean_spaces = false;
+                ignore_merges = true;
+                add_bos = true;
+            } else if (
+                    tokenizer_pre == "smollm") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLLM;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "codeshell") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                    tokenizer_pre == "bloom") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BLOOM;
+            } else if (
+                    tokenizer_pre == "gpt3-finnish") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH;
+            } else if (
+                    tokenizer_pre == "exaone") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
+            } else if (
+                    tokenizer_pre == "chameleon") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
+                add_bos = true;
+                clean_spaces = false;
+            } else if (
+                    tokenizer_pre == "minerva-7b") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_MINERVA;
+            } else if (
+                    tokenizer_pre == "megrez") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
+        } else if (type == LLAMA_VOCAB_TYPE_SPM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = true;
+            clean_spaces = false;
+            add_bos = true;
+            add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = false;
+            clean_spaces = true;
+            add_bos = true;
+            add_eos = false;
+        } else if (type == LLAMA_VOCAB_TYPE_UGM) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_bos = false;
+            add_eos = true;
+        } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            add_space_prefix = false;
+            clean_spaces = false;
+            add_bos = false;
+            add_eos = false;
+        } else {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
+
+        ml.get_key(LLM_KV_TOKENIZER_ADD_PREFIX, add_space_prefix, false);
+        ml.get_key(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, remove_extra_whitespaces, false);
     }
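The pre-tokenizer ladder above maps the GGUF tokenizer.ggml.pre string onto a pre_type enum while also toggling per-entry flags (clean_spaces, ignore_merges, add_bos). When no side effects are needed, the same mapping can be expressed as a lookup table; a sketch of that alternative, with an illustrative enum subset rather than the real pre-type values:

    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    enum class pre_type { DEFAULT, LLAMA3, FALCON, QWEN2 }; // illustrative subset

    static pre_type lookup_pre_type(const std::string & name) {
        // Several aliases may share one pre-type, as "megrez" shares QWEN2 above.
        static const std::unordered_map<std::string, pre_type> table = {
            {"default", pre_type::DEFAULT},
            {"llama3",  pre_type::LLAMA3}, {"llama-bpe", pre_type::LLAMA3},
            {"falcon",  pre_type::FALCON},
            {"qwen2",   pre_type::QWEN2},  {"megrez",    pre_type::QWEN2},
        };
        auto it = table.find(name);
        if (it == table.end()) {
            throw std::runtime_error("unknown pre-tokenizer type: '" + name + "'");
        }
        return it->second;
    }

The diff keeps the ladder because many entries set flags in addition to the enum, which a plain key-to-value table cannot express.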
-}
 
-
-
-std::
-
-bool parse_special) {
-GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+    const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
+    if (token_idx == -1) {
+        throw std::runtime_error("cannot find tokenizer vocab in model file\n");
+    }
 
-
-
+    const float * scores = nullptr;
+    const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
+    if (score_idx != -1) {
+        scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+    }
 
-
-
-
+    const int * toktypes = nullptr;
+    const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
+    if (toktype_idx != -1) {
+        toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
     }
 
-
-
-
-
-
-
-
+    uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
+    id_to_token.resize(n_tokens);
+
+    for (uint32_t i = 0; i < n_tokens; i++) {
+        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        if (word.empty()) {
+            LLAMA_LOG_WARN("%s: empty token at index %u\n", __func__, i);
+            word = "[EMPTY_" + std::to_string(i) + "]";
+        }
+
+        token_to_id[word] = i;
+        max_token_len = std::max(max_token_len, (int) word.size());
+
+        auto & token_data = id_to_token[i];
+        token_data.text = std::move(word);
+        token_data.score = scores ? scores[i] : 0.0f;
+        token_data.attr = LLAMA_TOKEN_ATTR_NORMAL;
+
+        if (toktypes) { //TODO: remove, required until per token attributes are available from GGUF file
+            switch(toktypes[i]) {
+                case LLAMA_TOKEN_TYPE_UNKNOWN: token_data.attr = LLAMA_TOKEN_ATTR_UNKNOWN; break;
+                case LLAMA_TOKEN_TYPE_UNUSED: token_data.attr = LLAMA_TOKEN_ATTR_UNUSED; break;
+                case LLAMA_TOKEN_TYPE_NORMAL: token_data.attr = LLAMA_TOKEN_ATTR_NORMAL; break;
+                case LLAMA_TOKEN_TYPE_CONTROL: token_data.attr = LLAMA_TOKEN_ATTR_CONTROL; break;
+                case LLAMA_TOKEN_TYPE_USER_DEFINED: token_data.attr = LLAMA_TOKEN_ATTR_USER_DEFINED; break;
+                case LLAMA_TOKEN_TYPE_BYTE: token_data.attr = LLAMA_TOKEN_ATTR_BYTE; break;
+                case LLAMA_TOKEN_TYPE_UNDEFINED: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+                default: token_data.attr = LLAMA_TOKEN_ATTR_UNDEFINED; break;
+            }
+        }
+    }
+    GGML_ASSERT(id_to_token.size() == token_to_id.size());
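The loop above stitches three parallel GGUF arrays (token texts, optional scores, optional token types) into the id_to_token table plus the reverse token_to_id map. A self-contained sketch of the same shape, with plain vectors standing in for the GGUF readers; build_tables and its types are illustrative, not the package's API:

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    struct token_data {
        std::string text;
        float       score;
        int         attr;
    };

    // Build forward and reverse token tables from parallel arrays.
    // 'scores' and 'types' may be empty, mirroring the optional GGUF keys above.
    static void build_tables(const std::vector<std::string> & words,
                             const std::vector<float> & scores,
                             const std::vector<int> & types,
                             std::vector<token_data> & id_to_token,
                             std::unordered_map<std::string, int32_t> & token_to_id) {
        id_to_token.resize(words.size());
        for (size_t i = 0; i < words.size(); i++) {
            token_to_id[words[i]] = (int32_t) i;
            id_to_token[i] = {
                words[i],
                scores.empty() ? 0.0f : scores[i], // default score when absent
                types.empty()  ? 0    : types[i],  // default attr when absent
            };
        }
    }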
 
-
+    init_tokenizer(type);
 
-
-
-
-
-
+    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
+    if (type == LLAMA_VOCAB_TYPE_SPM) {
+        try {
+            linefeed_id = vocab.byte_to_token('\n');
+        } catch (const std::exception & e) {
+            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
+            linefeed_id = special_pad_id;
+        }
+    } else if (type == LLAMA_VOCAB_TYPE_WPM) {
+        linefeed_id = special_pad_id;
+    } else if (type == LLAMA_VOCAB_TYPE_RWKV) {
+        const std::vector<int> ids = tokenize("\n", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        linefeed_id = ids[0];
+    } else {
+        const std::vector<int> ids = tokenize("\xC4\x8A", false); // U+010A
+
+        //GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        if (ids.empty()) {
+            LLAMA_LOG_WARN("%s: model vocab missing newline token, using special_pad_id instead\n", __func__);
+            linefeed_id = special_pad_id;
+        } else {
+            linefeed_id = ids[0];
+        }
+    }
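In the final else branch above, "\xC4\x8A" is the UTF-8 encoding of U+010A (Ċ), the character that byte-level BPE vocabularies (following GPT-2's byte-to-unicode mapping) use to spell the newline byte 0x0A; tokenizing it recovers the model's linefeed id. A quick check of the encoding arithmetic:

    #include <cassert>
    #include <cstdint>

    // UTF-8 encodes U+010A as the two-byte sequence 0xC4 0x8A.
    int main() {
        const uint32_t cp = 0x010A;            // 'Ċ', stand-in for '\n' in byte-level BPE vocabs
        const uint8_t b0 = 0xC0 | (cp >> 6);   // 110xxxxx -> 0xC4
        const uint8_t b1 = 0x80 | (cp & 0x3F); // 10xxxxxx -> 0x8A
        assert(b0 == 0xC4 && b1 == 0x8A);
        return 0;
    }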
 
-
-
-
+    // special tokens
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID, special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, special_eos_id },
+            { LLM_KV_TOKENIZER_EOT_ID, special_eot_id },
+            { LLM_KV_TOKENIZER_EOM_ID, special_eom_id },
+            { LLM_KV_TOKENIZER_UNK_ID, special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, special_pad_id },
+            { LLM_KV_TOKENIZER_MASK_ID, special_mask_id },
+            { LLM_KV_TOKENIZER_FIM_PRE_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_FIM_SUF_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_FIM_MID_ID, special_fim_mid_id },
+            { LLM_KV_TOKENIZER_FIM_PAD_ID, special_fim_pad_id },
+            { LLM_KV_TOKENIZER_FIM_REP_ID, special_fim_rep_id },
+            { LLM_KV_TOKENIZER_FIM_SEP_ID, special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, special_fim_mid_id },
+        };
+
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it);
+
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %u, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
+            }
+        }
 
-
-
-
-}
+        // Handle add_bos and add_eos
+        {
+            bool temp = true;
 
-
-
-
-
-
-
-
-
-
-
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                add_bos = temp;
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                add_eos = temp;
+            }
+        }
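The special_token_types table above pairs each metadata key with a reference to the member it overrides, so a single generic loop can write back into many named fields. A small sketch of that reference-binding pattern; the key strings mirror GGUF's tokenizer.ggml.* names, everything else is illustrative:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <utility>
    #include <vector>

    // Each entry pairs a key with a *reference* to the field it overrides.
    int main() {
        int32_t special_bos_id = -1;
        int32_t special_eos_id = -1;

        const std::vector<std::pair<std::string, int32_t &>> overrides = {
            { "tokenizer.ggml.bos_token_id", special_bos_id },
            { "tokenizer.ggml.eos_token_id", special_eos_id },
        };

        // Stand-in for ml.get_key(...): pretend the metadata sets BOS to 1.
        for (const auto & it : overrides) {
            if (it.first == "tokenizer.ggml.bos_token_id") {
                it.second = 1; // writes through the reference into special_bos_id
            }
        }

        std::printf("bos=%d eos=%d\n", special_bos_id, special_eos_id); // bos=1 eos=-1
        return 0;
    }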
+
+        // auto-detect special tokens by text
+        // TODO: convert scripts should provide these tokens through the KV metadata LLM_KV_TOKENIZER_...
+        // for now, we apply this workaround to find the tokens based on their text
+
+        for (const auto & t : token_to_id) {
+            // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
+            if (special_eot_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eot_id|>"
+                        || t.first == "<|im_end|>"
+                        || t.first == "<|end|>"
+                        || t.first == "<end_of_turn>"
+                        || t.first == "<|endoftext|>"
+                        || t.first == "<EOT>"
+                        || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                   ) {
+                    special_eot_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
+            }
 
-
-
-
-
-
+            // find EOM token: "<|eom_id|>"
+            if (special_eom_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|eom_id|>"
+                   ) {
+                    special_eom_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
+            }
 
-
-
-
+            // find FIM_PRE token: "<|fim_prefix|>", "<fim-prefix>", "<PRE>", etc.
+            if (special_fim_pre_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_prefix|>" // Qwen
+                        || t.first == "<fim-prefix>"
+                        || t.first == "<|fim▁begin|>" // DeepSeek
+                        || t.first == "<PRE>"
+                   ) {
+                    special_fim_pre_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
-}
-case LLAMA_VOCAB_TYPE_BPE:
-{
-llm_tokenizer_bpe_session session(vocab);
-// it calls some other methods that are not exist in llm_tokenizer,
-// here just cast it to bpe tokenizer object
-if (add_special) {
-session.append_bos(output);
-}
-for (const auto & fragment : fragment_buffer) {
-if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+            }
 
-
-
-
-
-
-
+            // find FIM_SUF token: "<|fim_suffix|>", "<fim-suffix>", "<SUF>", etc.
+            if (special_fim_suf_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_suffix|>" // Qwen
+                        || t.first == "<fim-suffix>"
+                        || t.first == "<|fim▁hole|>" // DeepSeek
+                        || t.first == "<SUF>"
+                   ) {
+                    special_fim_suf_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
+            }
 
-
-
-
-
-
-
-
-
-
-
+            // find FIM_MID token: "<|fim_middle|>", "<fim-middle>", "<MID>", etc.
+            if (special_fim_mid_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_middle|>" // Qwen
+                        || t.first == "<fim-middle>"
+                        || t.first == "<|fim▁end|>" // DeepSeek
+                        || t.first == "<MID>"
+                   ) {
+                    special_fim_mid_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
+            }
 
-
-
-
-
-
-
-
-
-
-
-
-
+            // find FIM_PAD token: "<|fim_pad|>", "<fim-pad>", "<PAD>", etc.
+            if (special_fim_pad_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_pad|>" // Qwen
+                        || t.first == "<fim-pad>"
+                        || t.first == "<PAD>"
+                   ) {
+                    special_fim_pad_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
+            }
 
-
-
-
-
-
-
-
-
-
-
+            // find FIM_REP token: "<|fim_repo|>", "<fim-repo>", "<REP>", etc.
+            if (special_fim_rep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|fim_repo|>" // Qwen
+                        || t.first == "<|repo_name|>"
+                        || t.first == "<fim-repo>"
+                        || t.first == "<REPO>"
+                   ) {
+                    special_fim_rep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                 }
-
+            }
 
-
-
-
-
-
-
-
-
-
+            // find FIM_SEP token: "<|file_sep|>"
+            if (special_fim_sep_id == LLAMA_TOKEN_NULL) {
+                if (false
+                        || t.first == "<|file_sep|>" // Qwen
+                   ) {
+                    special_fim_sep_id = t.second;
+                    if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.second, t.first.c_str());
+                        id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                     }
                 }
+            }
+        }
 
-
-
-
-
-"Are you sure this is what you want?\n", __FUNCTION__);
-}
+        // maintain a list of tokens that cause end-of-generation
+        // this is currently determined based on the token text, which is obviously not ideal
+        // ref: https://github.com/ggerganov/llama.cpp/issues/9606
+        special_eog_ids.clear();
 
-
-
-
-}
-} break;
-case LLAMA_VOCAB_TYPE_RWKV:
-{
-llm_tokenizer_rwkv_session session(vocab);
-for (const auto & fragment : fragment_buffer) {
-if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
-auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+        if (special_fim_pad_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_pad_id) == 0) {
+            special_eog_ids.insert(special_fim_pad_id);
+        }
 
-
-
-
+        if (special_fim_rep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_rep_id) == 0) {
+            special_eog_ids.insert(special_fim_rep_id);
+        }
 
-
-
-
-
+        if (special_fim_sep_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_fim_sep_id) == 0) {
+            special_eog_ids.insert(special_fim_sep_id);
+        }
+
+        for (const auto & t : token_to_id) {
+            if (false
+                    || t.first == "<|eot_id|>"
+                    || t.first == "<|im_end|>"
+                    || t.first == "<|end|>"
+                    || t.first == "<end_of_turn>"
+                    || t.first == "<|endoftext|>"
+                    || t.first == "<|eom_id|>"
+                    || t.first == "<EOT>"
+               ) {
+                special_eog_ids.insert(t.second);
+                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: %6d '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t.second, t.first.c_str());
+                    id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
                 }
-}
-
-
+            } else {
+                // token is control, but not marked as EOG -> print a debug log
+                if (id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL && special_eog_ids.count(t.second) == 0) {
+                    LLAMA_LOG_DEBUG("%s: control token: %6d '%s' is not marked as EOG\n",
+                        __func__, t.second, t.first.c_str());
+                }
+            }
+        }
+
+        // sanity checks
+        if (special_eos_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eos_id) == 0) {
+            special_eog_ids.insert(special_eos_id);
+            LLAMA_LOG_WARN("%s: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eot_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eot_id) == 0) {
+            special_eog_ids.insert(special_eot_id);
+            LLAMA_LOG_WARN("%s: special_eot_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
+
+        if (special_eom_id != LLAMA_TOKEN_NULL && special_eog_ids.count(special_eom_id) == 0) {
+            special_eog_ids.insert(special_eom_id);
+            LLAMA_LOG_WARN("%s: special_eom_id is not in special_eog_ids - the tokenizer config may be incorrect\n", __func__);
+        }
     }
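Everything inserted into special_eog_ids above feeds is_eog(), which generation loops consult so they stop on any end-of-generation token rather than on EOS alone. A minimal sketch of that consumer side; the struct is illustrative, not the package's API:

    #include <cstdint>
    #include <set>

    typedef int32_t llama_token;
    static const llama_token TOKEN_NULL = -1; // stands in for LLAMA_TOKEN_NULL

    // Minimal analogue of is_eog(): stop when the sampled token is any
    // member of the end-of-generation set, not just EOS.
    struct eog_set {
        std::set<llama_token> ids;

        bool is_eog(llama_token id) const {
            return id != TOKEN_NULL && ids.count(id) > 0;
        }
    };

    int main() {
        eog_set eog;
        eog.ids = { 2 /* EOS */, 32007 /* hypothetical <|eot_id|>-style id */ };

        // generation loop shape:
        // while (true) { llama_token t = sample(); if (eog.is_eog(t)) break; ... }
        return eog.is_eog(2) ? 0 : 1;
    }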
 
-
-
+    // build special tokens cache
+    {
+        for (llama_token id = 0; id < (llama_token) n_tokens; ++id) {
+            if (id_to_token[id].attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)) {
+                cache_special_tokens.push_back(id);
+            }
+        }
 
-
-
-
-switch (llama_vocab_get_type(vocab)) {
-case LLAMA_VOCAB_TYPE_SPM:
-case LLAMA_VOCAB_TYPE_UGM: {
-const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
-auto token = vocab.token_to_id.find(buf);
-if (token != vocab.token_to_id.end()) {
-return (*token).second;
+        std::sort(cache_special_tokens.begin(), cache_special_tokens.end(),
+            [&] (const llama_token a, const llama_token b) {
+                return id_to_token[a].text.size() > id_to_token[b].text.size();
             }
-
-
-
+        );
+
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t) cache_special_tokens.size());
+    }
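The descending-length sort above matters for partitioning: when one special token's text contains another's, the longer one must be tried first or it would never get a chance to match. A small demonstration with hypothetical token texts:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        // Hypothetical special tokens where one contains the other.
        std::vector<std::string> specials = { "<end>", "<<end>>" };

        // Same ordering rule as the diff: longest text first, so "<<end>>"
        // is matched before its substring "<end>" during partitioning.
        std::sort(specials.begin(), specials.end(),
            [] (const std::string & a, const std::string & b) {
                return a.size() > b.size();
            });

        for (const auto & s : specials) {
            std::printf("%s\n", s.c_str()); // prints <<end>>, then <end>
        }
        return 0;
    }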
+
+    // build token to piece cache
+    {
+        size_t size_cache = 0;
+
+        std::vector<std::string> cache(n_tokens);
+
+        for (uint32_t id = 0; id < n_tokens; ++id) {
+            cache[id] = token_to_piece_for_cache(id, true);
+
+            size_cache += cache[id].size();
         }
-
-
-
+
+        std::swap(cache_token_to_piece, cache);
+
+        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
+    }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Extract attributes from GGUF file.
+    {
+        auto _contains_any = [] (const std::string & str, const std::vector<std::string> & substrs) -> bool {
+            for (const auto & substr : substrs) {
+                if (str.find(substr) < std::string::npos) {
+                    return true;
+                }
+            }
+            return false;
+        };
+
+        auto _set_tokenid_attr = [&] (const llama_token id, llama_token_attr attr, bool value) {
+            uint32_t current = id_to_token.at(id).attr;
+            current = value ? (current | attr) : (current & ~attr);
+            id_to_token[id].attr = (llama_token_attr) current;
+        };
+
+        auto _set_token_attr = [&] (const std::string & token, llama_token_attr attr, bool value) {
+            _set_tokenid_attr(token_to_id.at(token), attr, value);
+        };
+
+        std::string model_name;
+        std::string tokenizer_pre;
+
+        ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
+
+        // model name to lowercase
+        std::transform(model_name.begin(), model_name.end(), model_name.begin(),
+            [] (const std::string::value_type x) {
+                return std::tolower(x);
+            }
+        );
+
+        // set attributes by model/tokenizer name
+        if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
+            _set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
+        } else if (_contains_any(model_name, {"phi-3", "phi3"})) {
+            for (auto id : cache_special_tokens) {
+                _set_tokenid_attr(id, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (const auto * token : {"</s>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, true);
+            }
+            for (const auto * token : {"<unk>", "<s>", "<|endoftext|>"}) {
+                _set_token_attr(token, LLAMA_TOKEN_ATTR_RSTRIP, false);
+            }
         }
-default:
-GGML_ABORT("fatal error");
     }
 }
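_set_tokenid_attr above uses the usual bitmask idiom: OR the attribute bit in to set it, AND with its complement to clear it. A compact sketch; the enum values here are illustrative, not the library's actual bit assignments:

    #include <cassert>
    #include <cstdint>

    enum token_attr : uint32_t {
        ATTR_NORMAL = 1 << 0,
        ATTR_LSTRIP = 1 << 4,
        ATTR_RSTRIP = 1 << 5,
    };

    // Same set/clear idiom as _set_tokenid_attr in the diff.
    static uint32_t set_attr(uint32_t current, uint32_t attr, bool value) {
        return value ? (current | attr) : (current & ~attr);
    }

    int main() {
        uint32_t a = ATTR_NORMAL;
        a = set_attr(a, ATTR_RSTRIP, true);
        assert(a & ATTR_RSTRIP);
        a = set_attr(a, ATTR_RSTRIP, false);
        assert(!(a & ATTR_RSTRIP));
        return 0;
    }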
 
-
-
-return vocab.id_to_token[token].text.c_str();
+enum llama_vocab_type llama_vocab::impl::get_type() const {
+    return type;
 }
 
-
-
-
+std::string llama_vocab::impl::type_name() const{
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+        case LLAMA_VOCAB_TYPE_UGM: return "UGM";
+        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
+        default: return "unknown";
+    }
 }
 
-
-GGML_ASSERT(
-return
+bool llama_vocab::impl::is_normal(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_NORMAL;
 }
 
-bool
-
+bool llama_vocab::impl::is_unknown(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNKNOWN;
 }
 
-bool
-
+bool llama_vocab::impl::is_control(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_CONTROL;
 }
 
-
-
+bool llama_vocab::impl::is_byte(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_BYTE;
 }
 
-
-
+bool llama_vocab::impl::is_user_defined(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_USER_DEFINED;
 }
 
-
-
+bool llama_vocab::impl::is_unused(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token[id].attr & LLAMA_TOKEN_ATTR_UNUSED;
 }
 
-
-return
+bool llama_vocab::impl::is_eog(llama_token id) const {
+    return id != LLAMA_TOKEN_NULL && special_eog_ids.count(id) > 0;
 }
 
-
-
+uint8_t llama_vocab::impl::token_to_byte(llama_token id) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    GGML_ASSERT(is_byte(id));
+    const auto & token_data = id_to_token.at(id);
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            auto buf = token_data.text.substr(3, 2);
+            return strtol(buf.c_str(), NULL, 16);
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            GGML_ABORT("fatal error");
+        }
+        case LLAMA_VOCAB_TYPE_WPM: {
+            GGML_ABORT("fatal error");
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
 }
 
-
-
+llama_token_attr llama_vocab::impl::token_get_attr(llama_token id) const {
+    GGML_ASSERT(type != LLAMA_VOCAB_TYPE_NONE);
+    return id_to_token.at(id).attr;
 }
 
-
-
-}
+void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
+    LLAMA_LOG_DEBUG("%s: initializing tokenizer for type %d\n", __func__, type);
 
-
-
+    switch (type) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            tokenizer = std::make_unique<llm_tokenizer_spm>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            tokenizer = std::make_unique<llm_tokenizer_bpe>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
+            break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
+            break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
+            break;
+        default:
+            GGML_ABORT("unsupported vocab type");
+    }
 }
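init_tokenizer above is a classic factory: one switch selects the concrete tokenizer for the vocab type and stores it behind a unique_ptr to a common base. A stripped-down sketch of that shape; the types here are illustrative:

    #include <memory>
    #include <stdexcept>

    struct tokenizer_base {
        virtual ~tokenizer_base() = default;
    };
    struct tokenizer_spm : tokenizer_base {};
    struct tokenizer_bpe : tokenizer_base {};

    enum class vocab_type { SPM, BPE };

    // One abstract base, one concrete tokenizer per vocab type,
    // selected once at load time.
    static std::unique_ptr<tokenizer_base> make_tokenizer(vocab_type t) {
        switch (t) {
            case vocab_type::SPM: return std::make_unique<tokenizer_spm>();
            case vocab_type::BPE: return std::make_unique<tokenizer_bpe>();
        }
        throw std::runtime_error("unsupported vocab type");
    }

The per-call sessions in tokenize() further below then downcast with static_cast on tokenizer.get(), relying on the invariant this factory establishes.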
 
-
-
-
+//
+// (de-) tokenize
+//
 
-
-return vocab.tokenizer_add_eos;
-}
+// #define PRETOKENIZERDEBUG
 
-
-
-
+void llama_vocab::impl::tokenizer_st_partition(std::forward_list<fragment_buffer_variant> & buffer, bool parse_special) const {
+    // for each special token
+    for (const llama_token special_id : cache_special_tokens) {
+        const auto & data = vocab.get_token_data(special_id);
+        const auto & text = data.text;
 
-
-
-
+        if (!parse_special && (data.attr & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_UNKNOWN))) {
+            // Ignore control and unknown tokens when parse_special == false
+            continue;
+            // User-defined tokens are still pre-tokenized before everything else
+            // ref: https://github.com/huggingface/tokenizers/blob/fdd26ba9a3f0c133427aab0423888cbde91362d7/tokenizers/src/tokenizer/mod.rs#L726
+            // This is mostly relevant for neox-style tokenizers (mpt, olmo, stablelm, etc.)
+        }
 
-
-
-
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
 
-
-
-
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                const auto & raw_text = fragment.raw_text;
 
-
-
-}
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
 
-
-
-
+                // loop over the text
+                while (true) {
+                    // find the first occurrence of a given special token in this fragment
+                    // passing offset argument only limit the "search area" but match coordinates
+                    // are still relative to the source full raw_text
+                    auto match = raw_text.find(text, raw_text_base_offset);
 
-
-
-}
+                    // no occurrences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
 
-
-
-}
+                    // check if match is within bounds of offset <-> length
+                    if (match + text.length() > raw_text_base_offset + raw_text_base_length) break;
 
-
-
-
+#ifdef PRETOKENIZERDEBUG
+                    LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
 
-
-
-
-
-
-
-
-
-
-
-
-
+                    // if match is further than base offset
+                    // then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        int64_t left_reminder_length = match - raw_text_base_offset;
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_LSTRIP) {
+                            while (left_reminder_length > 0 && isspace(raw_text[left_reminder_offset + left_reminder_length - 1])) {
+                                left_reminder_length--;
+                            }
+                        }
+
+                        if (left_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, left_reminder_offset, left_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + text.length() < raw_text_base_offset + raw_text_base_length) {
+                        int64_t right_reminder_offset = match + text.length();
+                        int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + text.length());
+
+                        if (data.attr & LLAMA_TOKEN_ATTR_RSTRIP) {
+                            while (right_reminder_length > 0 && isspace(raw_text[right_reminder_offset])) {
+                                right_reminder_offset++;
+                                right_reminder_length--;
+                            }
+                        }
+
+                        if (right_reminder_length > 0) {
+                            buffer.emplace_after(it, raw_text, right_reminder_offset, right_reminder_length);
+                            it++;
+                        }
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source - 1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
     }
+}
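tokenizer_st_partition above splits each raw-text fragment around every occurrence of a special token, emitting left text, the token id, and right text, then continues scanning the right remainder. A worked example of the core find-and-split step, on one hypothetical special token:

    #include <cstdio>
    #include <string>

    // "Hello<|im_end|>World" splits into a left raw-text fragment,
    // the special token, and a right raw-text fragment.
    int main() {
        const std::string raw = "Hello<|im_end|>World";
        const std::string tok = "<|im_end|>";

        const size_t match = raw.find(tok, 0);
        if (match != std::string::npos) {
            std::printf("left : '%s'\n", raw.substr(0, match).c_str());           // Hello
            std::printf("token: '%s'\n", tok.c_str());                            // <|im_end|>
            std::printf("right: '%s'\n", raw.substr(match + tok.size()).c_str()); // World
        }
        return 0;
    }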
 
-
-
+// NOTE: avoid ever using this except for building the token_to_piece caches
+std::string llama_vocab::impl::token_to_piece_for_cache(llama_token token, bool special) const {
+    std::string piece;
+    piece.resize(piece.capacity()); // using string internal cache
+    const int n_chars = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = vocab.token_to_piece(token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-return
+    return piece;
+}
+
+static void llama_escape_whitespace(std::string & text) {
+    replace_all(text, " ", "\xe2\x96\x81");
+}
+
+static void llama_unescape_whitespace(std::string & word) {
+    replace_all(word, "\xe2\x96\x81", " ");
 }
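token_to_piece_for_cache above relies on a size-negotiation convention: a negative return value means the buffer was too small and its magnitude is the required byte count, so the caller resizes and retries once. A self-contained sketch of the same convention; render_piece is a stand-in, not the package's API:

    #include <cstdio>
    #include <cstring>
    #include <string>

    // Negative return = "buffer too small, need -n bytes".
    static int render_piece(const char * src, char * buf, size_t len) {
        const int need = (int) std::strlen(src);
        if ((int) len < need) {
            return -need;               // report required size
        }
        std::memcpy(buf, src, need);
        return need;
    }

    int main() {
        std::string piece;
        piece.resize(piece.capacity()); // reuse the string's internal storage first
        int n = render_piece("\xe2\x96\x81Hello", &piece[0], piece.size());
        if (n < 0) {
            piece.resize(-n);           // grow to the reported size and retry
            n = render_piece("\xe2\x96\x81Hello", &piece[0], piece.size());
        }
        piece.resize(n);
        std::printf("'%s' (%d bytes)\n", piece.c_str(), n);
        return 0;
    }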
 
 static std::string llama_decode_text(const std::string & text) {
@@ -1773,11 +2305,185 @@ static std::string llama_decode_text(const std::string & text) {
     return decoded_text;
 }
 
-
-
+std::vector<llama_token> llama_vocab::impl::tokenize(
+        const std::string & raw_text,
+        bool add_special,
+        bool parse_special) const {
+    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
+
+    std::vector<llama_token> output;
+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+
+    if (!raw_text.empty()) {
+        fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
+        tokenizer_st_partition(fragment_buffer, parse_special);
+    }
+
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+            {
+                // OG tokenizer behavior:
+                //
+                // tokenizer.encode('', add_special_tokens=True) returns [1]
+                // tokenizer.encode('', add_special_tokens=False) returns []
+
+                bool is_prev_special = true; // prefix with space if first token
+
+                if (add_special && add_bos) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                    is_prev_special = true;
+                }
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text;
+
+                        // prefix with space if previous is special
+                        if (add_space_prefix && is_prev_special) {
+                            text = ' ';
+                        }
+
+                        text += fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        llama_escape_whitespace(text);
+                        llm_tokenizer_spm_session session(vocab);
+                        session.tokenize(text, output);
+                        is_prev_special = false;
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                        is_prev_special = true;
+                    }
+                }
+
+                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && add_eos) {
+                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_BPE:
+            {
+                llm_tokenizer_bpe_session session(vocab, *static_cast<const llm_tokenizer_bpe *>(tokenizer.get()));
+                // it calls some other methods that are not exist in llm_tokenizer,
+                // here just cast it to bpe tokenizer object
+                if (add_special) {
+                    session.append_bos(output);
+                }
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        session.append(fragment.token, output);
+                    }
+                }
+
+                if (add_special) {
+                    session.append_eos(output);
+                    session.check_double_bos_eos(output);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_WPM:
+            {
+                if (add_special) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                }
+
+                llm_tokenizer_wpm_session session(vocab);
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special) {
+                    GGML_ASSERT(special_sep_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_sep_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_UGM:
+            {
+                if (add_special && add_bos) {
+                    GGML_ASSERT(special_bos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_bos_id);
+                }
+                llm_tokenizer_ugm_session session(vocab, *static_cast<const llm_tokenizer_ugm *>(tokenizer.get()));
+
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+
+                if (add_special && add_bos && output.size() >= 2 && output[1] == special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && add_eos) {
+                    GGML_ASSERT(special_eos_id != LLAMA_TOKEN_NULL);
+                    output.push_back(special_eos_id);
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_RWKV:
+            {
+                llm_tokenizer_rwkv_session session(vocab, *static_cast<const llm_tokenizer_rwkv *>(tokenizer.get()));
+                for (const auto & fragment : fragment_buffer) {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
+#endif
+
+                        session.tokenize(text, output);
+                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                        output.push_back(fragment.token);
+                    }
+                }
+            } break;
+        case LLAMA_VOCAB_TYPE_NONE:
+            GGML_ABORT("fatal error");
+    }
+
+    return output;
+}
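For callers, the net effect of tokenize() above is controlled by two flags: add_special decides whether BOS/EOS get inserted according to the vocab's add_bos/add_eos settings, and parse_special decides whether control tokens written literally in the input (e.g. "<|im_end|>") are matched during partitioning or treated as plain text. A hedged sketch of a call site, assuming access to a loaded vocab object (the exact accessor spelling may differ from the package's public API):

    // Sketch only: 'vocab' is assumed to be a loaded llama_vocab-like object.
    // std::vector<llama_token> plain = vocab.tokenize("Hi <|im_end|>", /*add_special=*/true, /*parse_special=*/false);
    // std::vector<llama_token> ctrl  = vocab.tokenize("Hi <|im_end|>", /*add_special=*/true, /*parse_special=*/true);
    // 'plain' tokenizes the tag as ordinary text; 'ctrl' maps it to its control token id.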
+
+int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
     // ref: https://github.com/ggerganov/llama.cpp/pull/7587#discussion_r1620983843
     static const int attr_special = LLAMA_TOKEN_ATTR_UNKNOWN | LLAMA_TOKEN_ATTR_CONTROL;
-const llama_token_attr attr =
+    const llama_token_attr attr = token_get_attr(token);
     if (!special && (attr & attr_special)) {
         return 0;
     }
@@ -1798,7 +2504,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
 
     // if we have a cache - use it
     {
-const auto & cache =
+        const auto & cache = cache_token_to_piece;
 
         if (!cache.empty()) {
            const auto & result = cache.at(token);
@@ -1806,9 +2512,9 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
        }
     }
 
-if (0 <= token && token < (int32_t)
-const std::string & token_text =
-switch (
+    if (0 <= token && token < (int32_t) id_to_token.size()) {
+        const std::string & token_text = id_to_token[token].text;
+        switch (get_type()) {
            case LLAMA_VOCAB_TYPE_WPM:
            case LLAMA_VOCAB_TYPE_SPM:
            case LLAMA_VOCAB_TYPE_UGM: {
@@ -1823,7 +2529,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
                return _try_copy(result.data(), result.size());
            }
            if (attr & LLAMA_TOKEN_ATTR_BYTE) {
-char byte = (char)
+                char byte = (char) token_to_byte(token);
                return _try_copy((char*) &byte, 1);
            }
            break;
@@ -1859,43 +2565,46 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
     return 0;
 }
 
-
-
+const std::string & llama_vocab::impl::token_to_piece(llama_token token) const {
+    return cache_token_to_piece.at(token);
+}
+
+int32_t llama_vocab::impl::detokenize(
        const llama_token * tokens,
        int32_t n_tokens,
        char * text,
        int32_t text_len_max,
        bool remove_special,
-bool unparse_special) {
-if (
+        bool unparse_special) const {
+    if (type == LLAMA_VOCAB_TYPE_NONE) {
        return 0;
     }
 
-GGML_ASSERT(
+    GGML_ASSERT(tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first.");
 
     int32_t avail = text_len_max;
     int32_t total = 0;
 
     // remove the leading space
-bool remove_space =
+    bool remove_space = add_space_prefix;
 
-if (remove_special &&
-if (n_tokens > 0 && tokens[0] ==
+    if (remove_special && add_bos) {
+        if (n_tokens > 0 && tokens[0] == special_bos_id) {
            remove_space = false;
            n_tokens--;
            tokens++;
        }
     }
 
-if (remove_special &&
-if (n_tokens > 0 && tokens[n_tokens-1] ==
+    if (remove_special && add_eos) {
+        if (n_tokens > 0 && tokens[n_tokens - 1] == special_eos_id) {
            n_tokens--;
        }
     }
 
     for (int32_t i = 0; i < n_tokens; ++i) {
        GGML_ASSERT(avail >= 0);
-int32_t n_chars =
+        int32_t n_chars = token_to_piece(tokens[i], text, avail, remove_space, unparse_special);
        remove_space = false;
        if (n_chars < 0) {
            avail = 0;
@@ -1911,7 +2620,7 @@ int32_t llama_detokenize_impl(
        return -total;
     }
 
-if (
+    if (clean_spaces) {
        text -= total; // restart text
 
        // first pass: characters ?!., //TODO: where do these characters come from?
@@ -1972,13 +2681,321 @@ int32_t llama_detokenize_impl(
     return total <= text_len_max ? total : -total;
 }
 
-
+void llama_vocab::impl::print_info() const {
+    LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, type_name().c_str());
+    LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, vocab.n_tokens());
+    LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (uint32_t) bpe_ranks.size());
+
+    // special tokens
+    if (special_bos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, special_bos_id, id_to_token[special_bos_id].text.c_str() ); }
+    if (special_eos_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, special_eos_id, id_to_token[special_eos_id].text.c_str() ); }
+    if (special_eot_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, special_eot_id, id_to_token[special_eot_id].text.c_str() ); }
+    if (special_eom_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: EOM token = %d '%s'\n", __func__, special_eom_id, id_to_token[special_eom_id].text.c_str() ); }
+    if (special_unk_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, special_unk_id, id_to_token[special_unk_id].text.c_str() ); }
+    if (special_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, special_sep_id, id_to_token[special_sep_id].text.c_str() ); }
+    if (special_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, special_pad_id, id_to_token[special_pad_id].text.c_str() ); }
+    if (special_mask_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, special_mask_id, id_to_token[special_mask_id].text.c_str() ); }
+
+    if (linefeed_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, linefeed_id, id_to_token[linefeed_id].text.c_str() ); }
+
+    if (special_fim_pre_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PRE token = %d '%s'\n", __func__, special_fim_pre_id, id_to_token[special_fim_pre_id].text.c_str() ); }
+    if (special_fim_suf_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SUF token = %d '%s'\n", __func__, special_fim_suf_id, id_to_token[special_fim_suf_id].text.c_str() ); }
+    if (special_fim_mid_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM MID token = %d '%s'\n", __func__, special_fim_mid_id, id_to_token[special_fim_mid_id].text.c_str() ); }
+    if (special_fim_pad_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM PAD token = %d '%s'\n", __func__, special_fim_pad_id, id_to_token[special_fim_pad_id].text.c_str() ); }
+    if (special_fim_rep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM REP token = %d '%s'\n", __func__, special_fim_rep_id, id_to_token[special_fim_rep_id].text.c_str() ); }
+    if (special_fim_sep_id != LLAMA_TOKEN_NULL) { LLAMA_LOG_INFO( "%s: FIM SEP token = %d '%s'\n", __func__, special_fim_sep_id, id_to_token[special_fim_sep_id].text.c_str() ); }
+
+    for (const auto & id : special_eog_ids) {
+        LLAMA_LOG_INFO( "%s: EOG token = %d '%s'\n", __func__, id, id_to_token[id].text.c_str() );
+    }
+
+    LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, max_token_len);
+}
+
+llama_vocab::llama_vocab() : pimpl(new impl(*this)) {
+}
+
+llama_vocab::~llama_vocab() {
+}
+
+void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
+    pimpl->load(ml, kv);
+}
+
+enum llama_vocab_type llama_vocab::get_type() const {
+    return pimpl->type;
+}
+
+enum llama_vocab_pre_type llama_vocab::get_pre_type() const {
+    return pimpl->pre_type;
+}
+
+uint32_t llama_vocab::n_tokens() const {
+    return (uint32_t) pimpl->id_to_token.size();
+}
+
+uint32_t llama_vocab::n_token_types() const {
+    return (uint32_t) pimpl->n_token_types;
+}
+
+std::string llama_vocab::type_name() const{
+    return pimpl->type_name();
+}
+
+bool llama_vocab::is_normal(llama_token id) const {
+    return pimpl->is_normal(id);
+}
+
+bool llama_vocab::is_unknown(llama_token id) const {
+    return pimpl->is_unknown(id);
+}
+
+bool llama_vocab::is_control(llama_token id) const {
+    return pimpl->is_control(id);
+}
+
+bool llama_vocab::is_byte(llama_token id) const {
+    return pimpl->is_byte(id);
+}
+
+bool llama_vocab::is_user_defined(llama_token id) const {
+    return pimpl->is_user_defined(id);
+}
+
+bool llama_vocab::is_unused(llama_token id) const {
+    return pimpl->is_unused(id);
+}
+
+bool llama_vocab::is_eog(llama_token id) const {
+    return pimpl->is_eog(id);
+}
+
+uint8_t llama_vocab::token_to_byte(llama_token id) const {
+    return pimpl->token_to_byte(id);
+}
+
+llama_token llama_vocab::byte_to_token(uint8_t ch) const {
+    GGML_ASSERT(get_type() != LLAMA_VOCAB_TYPE_NONE);
+    static const char * hex = "0123456789ABCDEF";
+    switch (get_type()) {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
+            auto token = pimpl->token_to_id.find(buf);
+            if (token != pimpl->token_to_id.end()) {
+                return (*token).second;
+            }
+            // Try to fall back to just the byte as a string
+            const char buf2[2] = { (char)ch, 0 };
+            return pimpl->token_to_id.at(buf2);
+        }
+        case LLAMA_VOCAB_TYPE_WPM:
+        case LLAMA_VOCAB_TYPE_BPE: {
+            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
+        }
+        default:
+            GGML_ABORT("fatal error");
+    }
+}
|
+
|
|
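byte_to_token covers two byte-fallback schemes: SPM/UGM vocabs store raw bytes as literal "<0xXX>" tokens (falling back to the single-character string), while WPM/BPE vocabs map the byte through unicode_byte_to_utf8. A minimal round-trip sketch, not part of the diff, assuming `vocab` is an already-loaded llama_vocab:

    // Hypothetical usage sketch only.
    uint8_t ch = 0x41;                          // 'A'
    llama_token t   = vocab.byte_to_token(ch);  // SPM: looks up "<0x41>", else falls back to "A"
    uint8_t    back = vocab.token_to_byte(t);   // recovers 0x41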
+llama_token llama_vocab::text_to_token(const std::string & text) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    auto it = pimpl->token_to_id.find(text);
+    if (it != pimpl->token_to_id.end()) {
+        return (*it).second;
+    }
+    return LLAMA_TOKEN_NULL;
+}
+
+const llama_vocab::token_data & llama_vocab::get_token_data(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id);
+}
+
+const char * llama_vocab::token_get_text(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).text.c_str();
+}
+
+float llama_vocab::token_get_score(llama_token id) const {
+    GGML_ASSERT(pimpl->type != LLAMA_VOCAB_TYPE_NONE);
+    return pimpl->id_to_token.at(id).score;
+}
+
+llama_token_attr llama_vocab::token_get_attr(llama_token id) const {
+    return pimpl->token_get_attr(id);
+}
+
+llama_token llama_vocab::token_bos() const {
+    return pimpl->special_bos_id;
+}
+
+llama_token llama_vocab::token_eos() const {
+    return pimpl->special_eos_id;
+}
+
+llama_token llama_vocab::token_eot() const {
+    return pimpl->special_eot_id;
+}
+
+llama_token llama_vocab::token_eom() const {
+    return pimpl->special_eom_id;
+}
+
+llama_token llama_vocab::token_unk() const {
+    return pimpl->special_unk_id;
+}
+
+llama_token llama_vocab::token_sep() const {
+    return pimpl->special_sep_id;
+}
+
+llama_token llama_vocab::token_nl() const {
+    return pimpl->linefeed_id;
+}
+
+llama_token llama_vocab::token_pad() const {
+    return pimpl->special_pad_id;
+}
+
+llama_token llama_vocab::token_prefix() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_middle() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_suffix() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_pre() const {
+    return pimpl->special_fim_pre_id;
+}
+
+llama_token llama_vocab::token_fim_suf() const {
+    return pimpl->special_fim_suf_id;
+}
+
+llama_token llama_vocab::token_fim_mid() const {
+    return pimpl->special_fim_mid_id;
+}
+
+llama_token llama_vocab::token_fim_pad() const {
+    return pimpl->special_fim_pad_id;
+}
+
+llama_token llama_vocab::token_fim_rep() const {
+    return pimpl->special_fim_rep_id;
+}
+
+llama_token llama_vocab::token_fim_sep() const {
+    return pimpl->special_fim_sep_id;
+}
+
+bool llama_vocab::get_add_space_prefix() const {
+    return pimpl->add_space_prefix;
+}
+
+bool llama_vocab::get_add_bos() const {
+    return pimpl->add_bos;
+}
+
+bool llama_vocab::get_add_eos() const {
+    return pimpl->add_eos;
+}
+
+bool llama_vocab::get_ignore_merges() const {
+    return pimpl->ignore_merges;
+}
+
+bool llama_vocab::get_clean_spaces() const {
+    return pimpl->clean_spaces;
+}
+
+bool llama_vocab::get_remove_extra_whitespaces() const {
+    return pimpl->remove_extra_whitespaces;
+}
+
+bool llama_vocab::get_escape_whitespaces() const {
+    return pimpl->escape_whitespaces;
+}
+
+bool llama_vocab::get_treat_whitespace_as_suffix() const {
+    return pimpl->treat_whitespace_as_suffix;
+}
+
+int llama_vocab::max_token_len() const {
+    return pimpl->max_token_len;
+}
+
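These accessors expose per-token metadata through the new llama_vocab class. A short lookup sketch, not part of the diff, assuming `vocab` is a loaded llama_vocab:

    // Hypothetical usage sketch only.
    llama_token id = vocab.text_to_token("hello");  // LLAMA_TOKEN_NULL if absent
    if (id != LLAMA_TOKEN_NULL) {
        const char *     txt   = vocab.token_get_text(id);   // the piece text
        float            score = vocab.token_get_score(id);  // e.g. SPM log-probability
        llama_token_attr attr  = vocab.token_get_attr(id);   // normal/control/byte/...
    }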
+int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string & token_right) const {
+    GGML_ASSERT(token_left.find(' ') == std::string::npos);
+    GGML_ASSERT(token_left.find('\n') == std::string::npos);
+    GGML_ASSERT(token_right.find(' ') == std::string::npos);
+    GGML_ASSERT(token_right.find('\n') == std::string::npos);
+
+    auto it = pimpl->bpe_ranks.find(std::make_pair(token_left, token_right));
+    if (it == pimpl->bpe_ranks.end()) {
+        return -1;
+    }
+
+    return it->second;
+}
+
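find_bpe_rank returns the merge priority of an adjacent token pair, or -1 when the pair is not a known merge; a lower rank means an earlier, higher-priority merge (the asserts require that neither side contain a space or newline). A sketch of how a BPE merge loop could use it, with `vocab` and a `pieces` vector of current symbol strings both assumed, not taken from this diff:

    #include <climits>
    // Find the highest-priority (lowest-rank) adjacent pair to merge next.
    int    best_rank = INT_MAX;
    size_t best_i    = 0;
    for (size_t i = 0; i + 1 < pieces.size(); ++i) {
        const int rank = vocab.find_bpe_rank(pieces[i], pieces[i + 1]);
        if (rank != -1 && rank < best_rank) {
            best_rank = rank;
            best_i    = i;
        }
    }
    // if best_rank != INT_MAX, merge pieces[best_i] with pieces[best_i + 1]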
+int32_t llama_vocab::tokenize(
+        const char * text,
+        int32_t text_len,
+        llama_token * tokens,
+        int32_t n_tokens_max,
+        bool add_special,
+        bool parse_special) const {
+    auto res = tokenize(std::string(text, text_len), add_special, parse_special);
+    if (n_tokens_max < (int) res.size()) {
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size());
+    }
+
+    for (size_t i = 0; i < res.size(); i++) {
+        tokens[i] = res[i];
+    }
+
+    return res.size();
+}
+
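This buffer-based tokenize wrapper signals overflow by returning the negative of the required token count, which enables a two-pass size-query pattern. A sketch of that pattern, assuming `vocab` and a NUL-terminated `text` (error handling elided; not part of the diff):

    #include <cstring>
    #include <vector>
    std::vector<llama_token> toks(16);
    int32_t n = vocab.tokenize(text, (int32_t) strlen(text),
                               toks.data(), (int32_t) toks.size(),
                               /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {
        toks.resize(-n); // -n is the exact count needed
        n = vocab.tokenize(text, (int32_t) strlen(text),
                           toks.data(), (int32_t) toks.size(), true, false);
    }
    toks.resize(n);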
+std::vector<llama_token> llama_vocab::tokenize(
+        const std::string & raw_text,
+        bool add_special,
+        bool parse_special) const {
+    return pimpl->tokenize(raw_text, add_special, parse_special);
+}
+
+const std::string & llama_vocab::token_to_piece(llama_token token) const {
+    return pimpl->token_to_piece(token);
+}
+
+int32_t llama_vocab::token_to_piece(llama_token token, char * buf, int32_t length, int32_t lstrip, bool special) const {
+    return pimpl->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_vocab::detokenize(
+        const llama_token * tokens,
+        int32_t n_tokens,
+        char * text,
+        int32_t text_len_max,
+        bool remove_special,
+        bool unparse_special) const {
+    return pimpl->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
+std::string llama_vocab::detokenize(const std::vector<llama_token> & tokens, bool special) const {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars =
+    int32_t n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars =
+        n_chars = detokenize(tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
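The std::string overload applies the same negative-return convention internally: the first detokenize call writes into whatever capacity the string already has, and a negative n_chars is the exact size needed for the single retry (resizing to max(capacity, size) avoids shrinking a pre-reserved buffer). Callers simply receive a string, as in this sketch (assuming `vocab` and a token vector `tokens`; not part of the diff):

    std::string text = vocab.detokenize(tokens, /*special=*/true);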
@@ -1987,3 +3004,243 @@ std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
     return text;
 }
+
+void llama_vocab::print_info() const {
+    pimpl->print_info();
+}
+
+//
+// interface implementation
+//
+
+int32_t llama_vocab_n_tokens(const struct llama_vocab * vocab) {
+    return vocab->n_tokens();
+}
+
+// deprecated
+int32_t llama_n_vocab(const struct llama_vocab * vocab) {
+    return llama_vocab_n_tokens(vocab);
+}
+
+enum llama_vocab_type llama_vocab_type(const struct llama_vocab * vocab) {
+    return vocab->get_type();
+}
+
+const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_text(token);
+}
+
+float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_score(token);
+}
+
+enum llama_token_attr llama_vocab_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->token_get_attr(token);
+}
+
+bool llama_vocab_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_eog(token);
+}
+
+bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return vocab->is_control(token);
+}
+
+llama_token llama_vocab_bos(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_eos(const struct llama_vocab * vocab) {
+    return vocab->token_eos();
+}
+
+llama_token llama_vocab_eot(const struct llama_vocab * vocab) {
+    return vocab->token_eot();
+}
+
+// deprecated
+llama_token llama_vocab_cls(const struct llama_vocab * vocab) {
+    return vocab->token_bos();
+}
+
+llama_token llama_vocab_sep(const struct llama_vocab * vocab) {
+    return vocab->token_sep();
+}
+
+llama_token llama_vocab_nl (const struct llama_vocab * vocab) {
+    return vocab->token_nl();
+}
+
+llama_token llama_vocab_pad(const struct llama_vocab * vocab) {
+    return vocab->token_pad();
+}
+
+bool llama_vocab_get_add_bos(const struct llama_vocab * vocab) {
+    return vocab->get_add_bos();
+}
+
+bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
+    return vocab->get_add_eos();
+}
+
+llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pre();
+}
+
+llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab) {
+    return vocab->token_fim_suf();
+}
+
+llama_token llama_vocab_fim_mid(const struct llama_vocab * vocab) {
+    return vocab->token_fim_mid();
+}
+
+llama_token llama_vocab_fim_pad(const struct llama_vocab * vocab) {
+    return vocab->token_fim_pad();
+}
+
+llama_token llama_vocab_fim_rep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_rep();
+}
+
+llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
+    return vocab->token_fim_sep();
+}
+
+// deprecated
+const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_text(vocab, token);
+}
+
+// deprecated
+float llama_token_get_score(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_score(vocab, token);
+}
+
+// deprecated
+enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_get_attr(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_eog(vocab, token);
+}
+
+// deprecated
+bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token) {
+    return llama_vocab_is_control(vocab, token);
+}
+
+// deprecated
+llama_token llama_token_bos(const struct llama_vocab * vocab) {
+    return llama_vocab_bos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eos(const struct llama_vocab * vocab) {
+    return llama_vocab_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_eot(const struct llama_vocab * vocab) {
+    return llama_vocab_eot(vocab);
+}
+
+// deprecated
+llama_token llama_token_cls(const struct llama_vocab * vocab) {
+    //return llama_vocab_cls(vocab);
+    return llama_vocab_bos(vocab); // avoid deprecation warning
+}
+
+// deprecated
+llama_token llama_token_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_sep(vocab);
+}
+
+// deprecated
+llama_token llama_token_nl (const struct llama_vocab * vocab) {
+    return llama_vocab_nl(vocab);
+}
+
+// deprecated
+llama_token llama_token_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_pad(vocab);
+}
+
+// deprecated
+bool llama_add_bos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_bos(vocab);
+}
+
+// deprecated
+bool llama_add_eos_token(const struct llama_vocab * vocab) {
+    return llama_vocab_get_add_eos(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pre(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pre(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_suf(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_suf(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_mid(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_mid(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_pad(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_pad(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_rep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_rep(vocab);
+}
+
+// deprecated
+llama_token llama_token_fim_sep(const struct llama_vocab * vocab) {
+    return llama_vocab_fim_sep(vocab);
+}
+
+//
+// tokenization
+//
+
+int32_t llama_tokenize(
+        const struct llama_vocab * vocab,
+        const char * text,
+        int32_t text_len,
+        llama_token * tokens,
+        int32_t n_tokens_max,
+        bool add_special,
+        bool parse_special) {
+    return vocab->tokenize(text, text_len, tokens, n_tokens_max, add_special, parse_special);
+}
+
+int32_t llama_token_to_piece(
+        const struct llama_vocab * vocab,
+        llama_token token,
+        char * buf,
+        int32_t length,
+        int32_t lstrip,
+        bool special) {
+    return vocab->token_to_piece(token, buf, length, lstrip, special);
+}
+
+int32_t llama_detokenize(
+        const struct llama_vocab * vocab,
+        const llama_token * tokens,
+        int32_t n_tokens,
+        char * text,
+        int32_t text_len_max,
+        bool remove_special,
+        bool unparse_special) {
+    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
+}
+
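Taken together, these shims keep the old llama_token_* and llama_n_vocab entry points compiling while steering callers to the new llama_vocab_* names. A migration sketch, assuming `vocab` was obtained from the model (for example via llama_model_get_vocab, which is not shown in this diff):

    // Hypothetical migration sketch only.
    llama_token bos = llama_vocab_bos(vocab);          // was: llama_token_bos(vocab)
    int32_t     n   = llama_vocab_n_tokens(vocab);     // was: llama_n_vocab(vocab)
    bool        eog = llama_vocab_is_eog(vocab, bos);  // was: llama_token_is_eog(vocab, bos)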