llama_cpp 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +41 -21
- data/ext/llama_cpp/src/ggml-metal.m +44 -3
- data/ext/llama_cpp/src/ggml-metal.metal +162 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +30 -56
- data/ext/llama_cpp/src/ggml.c +13 -9
- data/ext/llama_cpp/src/ggml.h +3 -2
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +359 -58
- data/ext/llama_cpp/src/llama.h +18 -12
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -4
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
```diff
@@ -75,6 +75,7 @@
 #include <thread>
 #include <unordered_map>
 #include <set>
+#include <forward_list>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -1178,6 +1179,8 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
 
+    std::unordered_map<token, id> special_tokens_cache;
+
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
 
     // default LLaMA special tokens
```
```diff
@@ -1442,7 +1445,10 @@ static bool llama_kv_cache_find_slot(
 
     for (uint32_t i = 0; i < n_tokens; i++) {
         cache.cells[cache.head + i].pos = batch.pos[i];
-        cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i]);
+
+        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
+            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
+        }
     }
 
     return true;
```
```diff
@@ -1522,6 +1528,9 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
+        } else {
+            cache.cells[i].seq_id.clear();
+            cache.cells[i].seq_id.insert(seq_id);
         }
     }
 
```
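The two hunks above move the KV cache from one sequence id per cell to a set per cell. A minimal, self-contained sketch of that invariant (the struct and names here are illustrative, not the exact definitions in the bundled llama.cpp):

```cpp
// Illustrative model of the cells touched above: each KV cell stores a
// position plus the *set* of sequence ids that reference it, so one
// evaluated token can serve many sequences at once.
#include <cstdint>
#include <set>
#include <vector>

using llama_pos    = int32_t;
using llama_seq_id = int32_t;

struct kv_cell {
    llama_pos pos = -1;
    std::set<llama_seq_id> seq_id;

    bool has_seq_id(llama_seq_id id) const { return seq_id.count(id) > 0; }
};

int main() {
    std::vector<kv_cell> cells(4);

    // find_slot now inserts every id from batch.seq_id[i], e.g. a prompt
    // token shared by sequences 0 and 1:
    cells[0].pos = 0;
    cells[0].seq_id.insert(0);
    cells[0].seq_id.insert(1);

    // seq_keep(1): cells outside sequence 1 are cleared; kept cells are
    // shrunk to exactly {1}, which is what the new else-branch does.
    for (auto & c : cells) {
        if (!c.has_seq_id(1)) {
            c.pos = -1;
            c.seq_id.clear();
        } else {
            c.seq_id.clear();
            c.seq_id.insert(1);
        }
    }
    return cells[0].has_seq_id(1) ? 0 : 1;
}
```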
```diff
@@ -2120,7 +2129,7 @@ static void llm_load_hparams(
 }
 
 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
 
 static void llm_load_vocab(
```
```diff
@@ -2236,6 +2245,101 @@ static void llm_load_vocab(
     GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
     GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
     GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
+
+    // build special tokens cache
+    {
+        // TODO: It is unclear (to me) at this point, whether special tokens are guaranteed to be of a deterministic type,
+        //  and will always be correctly labeled in 'added_tokens.json' etc.
+        // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
+        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
+        //  are special tokens.
+        // From testing, this appears to correlate 1:1 with special tokens.
+        //
+
+        // Counting special tokens and verifying in only one direction
+        //  is sufficient to detect difference in those two sets.
+        //
+        uint32_t special_tokens_count_by_type = 0;
+        uint32_t special_tokens_count_from_verification = 0;
+
+        bool special_tokens_definition_mismatch = false;
+
+        for (const auto & t : vocab.token_to_id) {
+            const auto & token = t.first;
+            const auto & id    = t.second;
+
+            // Count all non-normal tokens in the vocab while iterating
+            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+                special_tokens_count_by_type++;
+            }
+
+            // Skip single character tokens
+            if (token.length() > 1) {
+                bool is_tokenizable = false;
+
+                // Split token string representation in two, in all possible ways
+                //  and check if both halves can be matched to a valid token
+                for (unsigned i = 1; i < token.length();) {
+                    const auto left  = token.substr(0, i);
+                    const auto right = token.substr(i);
+
+                    // check if we didn't partition in the middle of a utf sequence
+                    auto utf = utf8_len(left.at(left.length() - 1));
+
+                    if (utf == 1) {
+                        if (vocab.token_to_id.find(left)  != vocab.token_to_id.end() &&
+                            vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
+                            is_tokenizable = true;
+                            break;
+                        }
+                        i++;
+                    } else {
+                        // skip over the rest of multibyte utf sequence
+                        i += utf - 1;
+                    }
+                }
+
+                if (!is_tokenizable) {
+                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
+                    //  it's faster to re-filter them here, since there are way less candidates now
+
+                    // Calculate a total "utf" length of a token string representation
+                    size_t utf8_str_len = 0;
+                    for (unsigned i = 0; i < token.length();) {
+                        utf8_str_len++;
+                        i += utf8_len(token.at(i));
+                    }
+
+                    // And skip the ones which are one character
+                    if (utf8_str_len > 1) {
+                        // At this point what we have left are special tokens only
+                        vocab.special_tokens_cache[token] = id;
+
+                        // Count manually found special tokens
+                        special_tokens_count_from_verification++;
+
+                        // If this manually found special token is not marked as such, flag a mismatch
+                        if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
+                            special_tokens_definition_mismatch = true;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
+            LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size(),
+                special_tokens_count_by_type, vocab.id_to_token.size()
+            );
+        } else {
+            LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size()
+            );
+        }
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
```
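To make the heuristic above concrete, here is a hedged, standalone re-implementation of its core check. `is_special` and the toy vocab are invented for illustration; the real code additionally re-filters candidates whose UTF-8 length is 1, as the hunk shows.

```cpp
// A token is a special-token candidate when no split point yields two
// halves that are both themselves vocab entries.
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>

static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    return lookup[static_cast<uint8_t>(src) >> 4];
}

static bool is_special(const std::string & token,
                       const std::unordered_map<std::string, int> & vocab) {
    if (token.length() <= 1) return false;
    for (unsigned i = 1; i < token.length();) {
        const auto left  = token.substr(0, i);
        const auto right = token.substr(i);
        if (utf8_len(left.back()) == 1) {            // don't split inside a UTF-8 sequence
            if (vocab.count(left) && vocab.count(right)) {
                return false;                        // tokenizable -> not special
            }
            i++;
        } else {
            i += utf8_len(left.back()) - 1;          // skip rest of the multibyte sequence
        }
    }
    return true;                                     // unmatchable by the tokenizer
}

int main() {
    const std::unordered_map<std::string, int> vocab = {
        { "he", 1 }, { "llo", 2 }, { "hello", 3 }, { "<s>", 4 },
    };
    printf("hello -> %d\n", is_special("hello", vocab)); // 0: "he" + "llo" matches
    printf("<s>   -> %d\n", is_special("<s>",   vocab)); // 1: no split matches
    return 0;
}
```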
```diff
@@ -2834,8 +2938,8 @@ static void llm_load_tensors(
                 auto & layer = model.layers[i];
 
                 layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd,
-                layer.wo
+                layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
 
                 layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
```
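For reference, the `{n_embd, n_embd + 2*n_embd_gqa}` shape follows from grouped-query attention: Q keeps one slice per attention head, while K and V only need one per KV head. A quick sanity check with made-up (not model-specific) sizes:

```cpp
// Why the fused QKV weight has n_embd + 2*n_embd_gqa output rows under GQA.
// The numbers below are illustrative, not taken from any particular model.
#include <cstdio>

int main() {
    const int n_embd    = 4096;
    const int n_head    = 32;
    const int n_head_kv = 8;                          // GQA: 4 query heads per KV head

    const int n_embd_head = n_embd / n_head;          // 128 per head
    const int n_embd_gqa  = n_embd_head * n_head_kv;  // 1024 (K rows == V rows)

    printf("wqkv rows: %d\n", n_embd + 2 * n_embd_gqa); // 4096 + 2048 = 6144
    return 0;
}
```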
```diff
@@ -3075,7 +3179,7 @@ static struct ggml_cgraph * llm_build_llama(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3461,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3860,7 +3964,7 @@ static struct ggml_cgraph * llm_build_refact(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4212,7 +4316,7 @@ static struct ggml_cgraph * llm_build_falcon(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4564,7 +4668,7 @@ static struct ggml_cgraph * llm_build_starcoder(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4795,7 +4899,7 @@ static struct ggml_cgraph * llm_build_persimmon(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                         data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
@@ -5193,7 +5297,7 @@ static struct ggml_cgraph * llm_build_bloom(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -5363,7 +5467,7 @@ static struct ggml_cgraph * llm_build_mpt(
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_ctx   = cparams.n_ctx;
     const int64_t n_head  = hparams.n_head;
-    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
 
@@ -5461,7 +5565,7 @@ static struct ggml_cgraph * llm_build_mpt(
         for (int h = 0; h < 1; ++h) {
            for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos    pos    = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
 
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
```
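All eight graph builders above make the same substitution: the KV attention mask is computed against the token's first sequence id, `batch.seq_id[j][0]`. A simplified sketch of that masking loop, with plain structs standing in for the ggml tensor the real code writes `-INFINITY` into:

```cpp
// A KV cell is visible to token j only if it belongs to the token's first
// sequence id and does not lie in the future.
#include <cstdio>
#include <set>
#include <vector>

struct cell {
    int pos;
    std::set<int> seq_id;
    bool has_seq_id(int s) const { return seq_id.count(s) > 0; }
};

int main() {
    const std::vector<cell> kv  = { { 0, { 0 } }, { 1, { 0 } }, { 0, { 1 } } };
    const std::vector<int>  pos = { 1 };                  // one token at position 1 ...
    const std::vector<std::vector<int>> seq = { { 0 } };  // ... in sequence 0

    for (size_t j = 0; j < pos.size(); ++j) {
        const int seq_id = seq[j][0];                     // batch.seq_id[j][0] in the diff
        for (size_t i = 0; i < kv.size(); ++i) {
            const bool masked = !kv[i].has_seq_id(seq_id) || kv[i].pos > pos[j];
            printf("token %zu, cell %zu -> %s\n", j, i, masked ? "-inf" : "0");
        }
    }
    return 0;
}
```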
```diff
@@ -5761,8 +5865,11 @@ static int llama_decode_internal(
 
     // helpers for smoother batch API transition
     // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos>    pos;
-    std::vector<llama_seq_id> seq_id;
+    std::vector<llama_pos> pos;
+
+    std::vector<int32_t>                   n_seq_id;
+    std::vector<llama_seq_id *>            seq_id_arr;
+    std::vector<std::vector<llama_seq_id>> seq_id;
 
     if (batch.pos == nullptr) {
         pos.resize(n_tokens);
@@ -5774,12 +5881,18 @@ static int llama_decode_internal(
     }
 
     if (batch.seq_id == nullptr) {
+        n_seq_id.resize(n_tokens);
         seq_id.resize(n_tokens);
+        seq_id_arr.resize(n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
-            seq_id[i] = batch.all_seq_id;
+            n_seq_id[i] = 1;
+            seq_id[i].resize(1);
+            seq_id[i][0] = batch.all_seq_id;
+            seq_id_arr[i] = seq_id[i].data();
         }
 
-        batch.seq_id = seq_id.data();
+        batch.n_seq_id = n_seq_id.data();
+        batch.seq_id = seq_id_arr.data();
     }
 
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
```
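These helper vectors exist so that legacy `llama_eval`-style callers, which only supply `all_seq_id`, still produce the per-token arrays the batch now carries. A sketch of the same expansion in isolation; note that `seq_id_arr` is only filled once `seq_id` has its final size, since `data()` pointers would be invalidated by reallocation:

```cpp
// Expand a single all_seq_id into per-token n_seq_id / seq_id arrays.
#include <cstdint>
#include <vector>

using llama_seq_id = int32_t;

int main() {
    const uint32_t     n_tokens   = 3;
    const llama_seq_id all_seq_id = 0;

    std::vector<int32_t>                   n_seq_id(n_tokens, 1);
    std::vector<std::vector<llama_seq_id>> seq_id(n_tokens, std::vector<llama_seq_id>(1, all_seq_id));
    std::vector<llama_seq_id *>            seq_id_arr(n_tokens);

    for (uint32_t i = 0; i < n_tokens; i++) {
        seq_id_arr[i] = seq_id[i].data();  // batch.seq_id becomes an array of pointers
    }

    // the real code then sets:
    //   batch.n_seq_id = n_seq_id.data();
    //   batch.seq_id   = seq_id_arr.data();
    return seq_id_arr[0][0] == all_seq_id ? 0 : 1;
}
```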
```diff
@@ -5800,6 +5913,13 @@ static int llama_decode_internal(
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
+    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
@@ -5817,6 +5937,12 @@ static int llama_decode_internal(
     }
 
     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif
 
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -5841,12 +5967,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name,        "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm")   == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
```
```diff
@@ -6199,7 +6319,6 @@ struct llm_tokenizer_bpe {
             llm_symbol sym;
             size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
             sym.text = word.c_str() + offset;
-            sym.n = 1;
             sym.n = char_len;
             offset += sym.n;
             sym.prev = index - 1;
```
```diff
@@ -6459,7 +6578,137 @@ private:
     llm_bigram_bpe::queue work_queue;
 };
 
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos) {
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant{
+    fragment_buffer_variant(llama_vocab::id _token)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0){}
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_vocab::id)-1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT( _offset >= 0 );
+            GGML_ASSERT( _length >= 1 );
+            GGML_ASSERT( offset + length <= raw_text.length() );
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_vocab::id token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
+};
+
+// #define PRETOKENIZERDEBUG
+
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+{
+    // for each special token
+    for (const auto & st: vocab.special_tokens_cache) {
+        const auto & special_token = st.first;
+        const auto & special_id    = st.second;
+
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
+
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                auto * raw_text = &(fragment.raw_text);
+
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
+
+                // loop over the text
+                while (true) {
+                    // find the first occurrence of a given special token in this fragment
+                    //  passing offset argument only limits the "search area" but match coordinates
+                    //  are still relative to the source full raw_text
+                    auto match = raw_text->find(special_token, raw_text_base_offset);
+
+                    // no occurrences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
+
+                    // check if match is within bounds of offset <-> length
+                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
+
+                    // if match is further than base offset
+                    //  then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        const int64_t left_reminder_length = match - raw_text_base_offset;
+                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                        it++;
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
+                        const int64_t right_reminder_offset = match + special_token.length();
+                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        it++;
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
+    }
+}
+
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
     std::vector<llama_vocab::id> output;
 
     // OG tokenizer behavior:
```
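The partitioning above is easier to follow with owned strings instead of the reference-holding `fragment_buffer_variant`. A simplified demo of the same idea, splitting fragments around each match of a single special token in a `std::forward_list` (the real code loops over the whole `special_tokens_cache`); all names here are invented:

```cpp
#include <cstdio>
#include <forward_list>
#include <string>

struct frag {
    bool        is_token;
    int         token;  // valid when is_token
    std::string text;   // valid when !is_token (an owned copy, for simplicity)
};

int main() {
    const std::string special    = "<s>";
    const int         special_id = 1;

    std::forward_list<frag> buf;
    buf.push_front({ false, -1, "hello <s>world<s> bye" });

    for (auto it = buf.begin(); it != buf.end(); ++it) {
        while (!it->is_token) {
            const auto match = it->text.find(special);
            if (match == std::string::npos) break;  // nothing left to split in *it

            const std::string right = it->text.substr(match + special.length());
            it->text.resize(match);                 // *it becomes the left part

            // insert the matched token, then the remaining text after it
            auto tok = buf.insert_after(it, { true, special_id, "" });
            if (right.empty()) break;
            it = buf.insert_after(tok, { false, -1, right });  // keep scanning the tail
        }
    }

    // prints: 'hello '[1]'world'[1]' bye'
    for (const auto & f : buf) {
        if (f.is_token) printf("[%d]", f.token);
        else            printf("'%s'", f.text.c_str());
    }
    printf("\n");
    return 0;
}
```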
```diff
@@ -6475,20 +6724,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         return output;
     }
 
+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+
+    if (special) tokenizer_st_partition( vocab, fragment_buffer );
+
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-                // without adding this leading whitespace, we do not get the same results as the original tokenizer
-                raw_text = " " + raw_text;
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
 
-                llm_tokenizer_spm tokenizer(vocab);
-                llama_escape_whitespace(raw_text);
-                tokenizer.tokenize(raw_text, output);
+                        // TODO: It's likely possible to get rid of this string copy entirely
+                        //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
+                        //  and passing 'add space prefix' as bool argument
+                        //
+                        auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_spm tokenizer(vocab);
+                        llama_escape_whitespace(raw_text);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                llm_tokenizer_bpe tokenizer(vocab);
-                tokenizer.tokenize(raw_text, output);
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_bpe tokenizer(vocab);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
     }
 
```
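One behavioral detail worth noting in the SPM branch: the leading space is only prepended to a fragment when `special` handling is off. A tiny illustration of just that line:

```cpp
// The (special ? "" : " ") prefix from the hunk above, in isolation.
#include <cstdio>
#include <string>

int main() {
    const std::string fragment = "Hello world";
    for (bool special : { false, true }) {
        std::string raw_text = (special ? "" : " ") + fragment;
        // llama_escape_whitespace would then replace each ' ' with U+2581 ("▁")
        printf("special=%d -> '%s'\n", (int) special, raw_text.c_str());
    }
    return 0;
}
```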
```diff
@@ -6761,7 +7048,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     std::vector<llama_grammar_candidate> rejects;
 
     if (stack.empty()) {
-        for (auto tok : candidates) {
+        for (const auto & tok : candidates) {
             if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                 rejects.push_back(tok);
             }
@@ -6772,7 +7059,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     const llama_grammar_element * stack_pos = stack.back();
 
     std::vector<llama_grammar_candidate> next_candidates;
-    for (auto tok : candidates) {
+    for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
             // that cannot satisfy this position in grammar
@@ -6798,7 +7085,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
     llama_grammar_advance_stack(rules, stack_after, next_stacks);
 
     auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-    for (auto tok : next_rejects) {
+    for (const auto & tok : next_rejects) {
         rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
     }
 
```
```diff
@@ -8831,6 +9118,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
 }
 
 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
     llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
 }
 
```
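The new guard makes copying a sequence onto itself a harmless no-op. The typical caller pattern this protects, sketched under the assumption of an already-initialized context (`fan_out_prompt` is a hypothetical helper, not part of the library):

```cpp
// After decoding a shared prompt into sequence 0, fan it out to n_parallel
// sequences without re-evaluating it; s == 0 copies onto itself, which this
// release turns into a no-op.
#include "llama.h"

void fan_out_prompt(llama_context * ctx, int n_parallel, int n_prompt_tokens) {
    for (llama_seq_id s = 0; s < n_parallel; ++s) {
        llama_kv_cache_seq_cp(ctx, 0, s, 0, n_prompt_tokens);
    }
}
```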
```diff
@@ -9283,7 +9573,7 @@ int llama_eval_embd(
         int n_past) {
     llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
 
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
 
     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
```
```diff
@@ -9304,20 +9594,21 @@ struct llama_batch llama_batch_get_one(
         llama_pos pos_0,
         llama_seq_id seq_id) {
     return {
-        /*n_tokens   =*/ n_tokens,
-        /*tokens     =*/ tokens,
-        /*embd       =*/ nullptr,
-        /*pos        =*/ nullptr,
-        /*seq_id     =*/ nullptr,
-        /*logits     =*/ nullptr,
-        /*all_pos_0  =*/ pos_0,
-        /*all_pos_1  =*/ 1,
-        /*all_seq_id =*/ seq_id,
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*n_seq_id   =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
     };
 }
 
-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
-    llama_batch batch = {
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };
 
     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
```
```diff
@@ -9325,19 +9616,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }
 
-    batch.pos    = (llama_pos *)    malloc(sizeof(llama_pos)    * n_tokens);
-    batch.seq_id = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_tokens);
-    batch.logits = (int8_t *)       malloc(sizeof(int8_t)       * n_tokens);
+    batch.pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
+    batch.n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
+    batch.seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
+    for (int i = 0; i < n_tokens; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.logits   = (int8_t *)        malloc(sizeof(int8_t)          * n_tokens);
 
     return batch;
 }
 
 void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)  free(batch.token);
-    if (batch.embd)   free(batch.embd);
-    if (batch.pos)    free(batch.pos);
-    if (batch.seq_id) free(batch.seq_id);
-    if (batch.logits) free(batch.logits);
+    if (batch.token)    free(batch.token);
+    if (batch.embd)     free(batch.embd);
+    if (batch.pos)      free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits)   free(batch.logits);
 }
 
 int llama_decode(
```
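A hedged usage sketch of the resulting allocation contract (`main` is illustrative): `llama_batch_init` now takes `n_seq_max` and allocates one `seq_id` array per token slot, while `llama_batch_free` only walks `batch.n_tokens` entries, so with this version a caller should make sure `n_tokens` reflects the allocated slots before freeing.

```cpp
#include "llama.h"

int main() {
    // capacity: 512 token slots, token ids (embd == 0), up to 4 seqs per token
    llama_batch batch = llama_batch_init(/*n_tokens =*/ 512, /*embd =*/ 0, /*n_seq_max =*/ 4);

    // ... fill batch.token / batch.pos / batch.n_seq_id / batch.seq_id / batch.logits ...
    batch.n_tokens = 512;  // llama_batch_free frees seq_id[0..n_tokens), per the hunk above

    llama_batch_free(batch);
    return 0;
}
```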
```diff
@@ -9402,15 +9703,15 @@ llama_token llama_token_eot(const struct llama_context * ctx) {
     return ctx->model.vocab.special_eot_id;
 }
 
-
 int llama_tokenize(
     const struct llama_model * model,
                   const char * text,
                          int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
+                        bool   add_bos,
+                        bool   special) {
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
```
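Finally, the public `llama_tokenize` gains the trailing `special` flag, which lets text like `"<s>"` match special tokens instead of being tokenized literally. A hedged usage sketch (the `tokenize` wrapper is invented; it assumes the usual llama.cpp convention that a negative return value is minus the number of tokens required):

```cpp
#include "llama.h"

#include <string>
#include <vector>

static std::vector<llama_token> tokenize(const llama_model * model,
                                         const std::string & text,
                                         bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0) + 1);
    int n = llama_tokenize(model, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(),
                           add_bos, /*special =*/ true);
    if (n < 0) {              // buffer too small; retry with the reported size
        tokens.resize(-n);
        n = llama_tokenize(model, text.c_str(), (int) text.size(),
                           tokens.data(), (int) tokens.size(),
                           add_bos, /*special =*/ true);
    }
    tokens.resize(n > 0 ? n : 0);
    return tokens;
}
```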