llama_cpp 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +18 -0
- data/examples/chat.rb +8 -6
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +122 -183
- data/ext/llama_cpp/src/ggml-cuda.cu +188 -20
- data/ext/llama_cpp/src/ggml-metal.m +57 -8
- data/ext/llama_cpp/src/ggml-metal.metal +171 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +188 -222
- data/ext/llama_cpp/src/ggml.c +375 -93
- data/ext/llama_cpp/src/ggml.h +11 -9
- data/ext/llama_cpp/src/k_quants.c +12 -20
- data/ext/llama_cpp/src/llama.cpp +459 -153
- data/ext/llama_cpp/src/llama.h +34 -33
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +4 -4
- data/sig/llama_cpp.rbs +15 -16
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -75,6 +75,7 @@
 #include <thread>
 #include <unordered_map>
 #include <set>
+#include <forward_list>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -969,14 +970,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }

-static std::string
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    }
+    }
+    else {
         result.resize(n_tokens);
     }

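Note: the helper above keeps the usual two-pass contract of the public llama_token_to_piece() C API (a negative return value is the negated buffer size that is required). A minimal caller-side sketch, assuming only a valid llama_context * ctx; the wrapper name is illustrative:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Sketch only: mirrors the internal helper above from application code.
    static std::string token_to_piece(llama_context * ctx, llama_token token) {
        std::vector<char> buf(8, 0);
        int n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), buf.size());
        if (n < 0) {
            buf.resize(-n); // first call reported how many bytes are needed
            n = llama_token_to_piece(llama_get_model(ctx), token, buf.data(), buf.size());
        }
        return std::string(buf.data(), n);
    }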
@@ -1012,8 +1014,8 @@ enum e_model {
 };

 static const size_t kB = 1024;
-static const size_t MB =
-static const size_t GB =
+static const size_t MB = 1024*kB;
+static const size_t GB = 1024*MB;

 struct llama_hparams {
     bool vocab_only;
@@ -1036,21 +1038,21 @@ struct llama_hparams {
     float f_max_alibi_bias;

     bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only
-        if (this->n_vocab
+        if (this->vocab_only != other.vocab_only) return true;
+        if (this->n_vocab != other.n_vocab) return true;
         if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd
-        if (this->n_head
-        if (this->n_head_kv
-        if (this->n_layer
-        if (this->n_rot
-        if (this->n_ff
+        if (this->n_embd != other.n_embd) return true;
+        if (this->n_head != other.n_head) return true;
+        if (this->n_head_kv != other.n_head_kv) return true;
+        if (this->n_layer != other.n_layer) return true;
+        if (this->n_rot != other.n_rot) return true;
+        if (this->n_ff != other.n_ff) return true;

         const float EPSILON = 1e-9;

-        if (!is_float_close(this->f_norm_eps,
-        if (!is_float_close(this->f_norm_rms_eps,
-        if (!is_float_close(this->rope_freq_base_train,
+        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

         return false;
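The comparison above relies on is_float_close(), which is defined elsewhere in llama.cpp and is not part of this hunk. A minimal sketch of the assumed behaviour (an absolute-tolerance float comparison, matching the EPSILON call sites above):

    #include <cmath>

    // Assumed shape only; the real helper lives elsewhere in llama.cpp.
    static bool is_float_close(float a, float b, float abs_tol) {
        return a == b || std::fabs(a - b) <= abs_tol;
    }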
@@ -1178,6 +1180,8 @@ struct llama_vocab {
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;

+    std::unordered_map<token, id> special_tokens_cache;
+
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;

     // default LLaMA special tokens
@@ -1187,17 +1191,17 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;

-    id linefeed_id
+    id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
     id special_suffix_id = 32008;
-    id special_eot_id
+    id special_eot_id = 32010;

     int find_bpe_rank(std::string token_left, std::string token_right) const {
-
-
-
-
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);

         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -1351,10 +1355,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-
-    // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
-    // change it and test that it works
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -1442,7 +1443,10 @@ static bool llama_kv_cache_find_slot(

     for (uint32_t i = 0; i < n_tokens; i++) {
         cache.cells[cache.head + i].pos = batch.pos[i];
-
+
+        for (int32_t j = 0; j < batch.n_seq_id[i]; j++) {
+            cache.cells[cache.head + i].seq_id.insert(batch.seq_id[i][j]);
+        }
     }

     return true;
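With this change a KV cell can belong to several sequences at once, driven by the per-token n_seq_id/seq_id arrays of the batch. A hedged caller-side sketch (llama_batch_init() and llama_batch_free() appear later in this diff; some_token is a placeholder id):

    // Sketch: one token that is shared by sequences 0 and 1.
    llama_batch batch = llama_batch_init(/*n_tokens=*/1, /*embd=*/0, /*n_seq_max=*/2);
    batch.n_tokens     = 1;
    batch.token[0]     = some_token;   // placeholder token id
    batch.pos[0]       = 0;
    batch.n_seq_id[0]  = 2;            // this cell belongs to two sequences
    batch.seq_id[0][0] = 0;
    batch.seq_id[0][1] = 1;
    batch.logits[0]    = 1;
    // ... llama_decode(ctx, batch); ...
    llama_batch_free(batch);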
@@ -1522,6 +1526,9 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
+        } else {
+            cache.cells[i].seq_id.clear();
+            cache.cells[i].seq_id.insert(seq_id);
         }
     }

@@ -2120,7 +2127,7 @@ static void llm_load_hparams(
 }

 // TODO: This should probably be in llama.h
-static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos);
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false);
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

 static void llm_load_vocab(
@@ -2227,15 +2234,130 @@ static void llm_load_vocab(
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-
+        const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
+        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
+        vocab.linefeed_id = ids[0];
     }

     // special tokens
-
-
-
-
-
+    {
+        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
+            { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
+            { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
+            { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
+            { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
+            { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
+        };
+        for (const auto & it : special_token_types) {
+            const std::string & key = kv(std::get<0>(it));
+            int32_t & id = std::get<1>(it), old_id = id;
+
+            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
+            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
+            // can only come from the default value, so there's no point in
+            // validating that.
+            if (size_t(id + 1) > vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
+                    __func__, key.c_str(), id, old_id);
+                id = old_id;
+            }
+        }
+    }
+
+    // build special tokens cache
+    {
+        // TODO: It is unclear (to me) at this point, whether special tokes are guaranteed to be of a deterministic type,
+        //  and will always be correctly labeled in 'added_tokens.json' etc.
+        // The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
+        //  to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
+        //  are special tokens.
+        // From testing, this appears to corelate 1:1 with special tokens.
+        //
+
+        // Counting special tokens and verifying in only one direction
+        //  is sufficient to detect difference in those two sets.
+        //
+        uint32_t special_tokens_count_by_type = 0;
+        uint32_t special_tokens_count_from_verification = 0;
+
+        bool special_tokens_definition_mismatch = false;
+
+        for (const auto & t : vocab.token_to_id) {
+            const auto & token = t.first;
+            const auto & id = t.second;
+
+            // Count all non-normal tokens in the vocab while iterating
+            if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
+                special_tokens_count_by_type++;
+            }
+
+            // Skip single character tokens
+            if (token.length() > 1) {
+                bool is_tokenizable = false;
+
+                // Split token string representation in two, in all possible ways
+                //  and check if both halves can be matched to a valid token
+                for (unsigned i = 1; i < token.length();) {
+                    const auto left = token.substr(0, i);
+                    const auto right = token.substr(i);
+
+                    // check if we didnt partition in the middle of a utf sequence
+                    auto utf = utf8_len(left.at(left.length() - 1));
+
+                    if (utf == 1) {
+                        if (vocab.token_to_id.find(left) != vocab.token_to_id.end() &&
+                            vocab.token_to_id.find(right) != vocab.token_to_id.end() ) {
+                            is_tokenizable = true;
+                            break;
+                        }
+                        i++;
+                    } else {
+                        // skip over the rest of multibyte utf sequence
+                        i += utf - 1;
+                    }
+                }
+
+                if (!is_tokenizable) {
+                    // Some tokens are multibyte, but they are utf sequences with equivalent text length of 1
+                    //  it's faster to re-filter them here, since there are way less candidates now
+
+                    // Calculate a total "utf" length of a token string representation
+                    size_t utf8_str_len = 0;
+                    for (unsigned i = 0; i < token.length();) {
+                        utf8_str_len++;
+                        i += utf8_len(token.at(i));
+                    }
+
+                    // And skip the ones which are one character
+                    if (utf8_str_len > 1) {
+                        // At this point what we have left are special tokens only
+                        vocab.special_tokens_cache[token] = id;
+
+                        // Count manually found special tokens
+                        special_tokens_count_from_verification++;
+
+                        // If this manually found special token is not marked as such, flag a mismatch
+                        if (vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL) {
+                            special_tokens_definition_mismatch = true;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (special_tokens_definition_mismatch || special_tokens_count_from_verification != special_tokens_count_by_type) {
+            LLAMA_LOG_WARN("%s: mismatch in special tokens definition ( %u/%zu vs %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size(),
+                special_tokens_count_by_type, vocab.id_to_token.size()
+            );
+        } else {
+            LLAMA_LOG_INFO("%s: special tokens definition check successful ( %u/%zu ).\n",
+                __func__,
+                special_tokens_count_from_verification, vocab.id_to_token.size()
+            );
+        }
+    }
 }

 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
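The splitting heuristic above depends on utf8_len(), which is defined earlier in llama.cpp and not shown in this diff. A sketch of the assumed behaviour (UTF-8 sequence length derived from the leading byte, so the splitter never cuts a multibyte character in half):

    // Assumed shape only: the real helper lives elsewhere in llama.cpp.
    static size_t utf8_len(char src) {
        const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
        return lookup[static_cast<uint8_t>(src) >> 4];
    }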
@@ -2834,8 +2956,8 @@ static void llm_load_tensors(
                     auto & layer = model.layers[i];

                     layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
-                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd,
-                    layer.wo
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);

                     layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

@@ -3075,7 +3197,7 @@ static struct ggml_cgraph * llm_build_llama(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3461,7 +3583,7 @@ static struct ggml_cgraph * llm_build_baichaun(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -3860,7 +3982,7 @@ static struct ggml_cgraph * llm_build_refact(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4212,7 +4334,7 @@ static struct ggml_cgraph * llm_build_falcon(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4564,7 +4686,7 @@ static struct ggml_cgraph * llm_build_starcoder(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -4795,7 +4917,7 @@ static struct ggml_cgraph * llm_build_persimmon(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];
                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
                         data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
@@ -5193,7 +5315,7 @@ static struct ggml_cgraph * llm_build_bloom(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -5363,7 +5485,7 @@ static struct ggml_cgraph * llm_build_mpt(
     const int64_t n_layer = hparams.n_layer;
     const int64_t n_ctx = cparams.n_ctx;
     const int64_t n_head = hparams.n_head;
-    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_head_kv = hparams.n_head_kv;
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa = hparams.n_embd_gqa();

@@ -5461,7 +5583,7 @@ static struct ggml_cgraph * llm_build_mpt(
         for (int h = 0; h < 1; ++h) {
             for (int j = 0; j < n_tokens; ++j) {
                 const llama_pos pos = batch.pos[j];
-                const llama_seq_id seq_id = batch.seq_id[j];
+                const llama_seq_id seq_id = batch.seq_id[j][0];

                 for (int i = 0; i < n_kv; ++i) {
                     if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
@@ -5761,8 +5883,11 @@ static int llama_decode_internal(

     // helpers for smoother batch API transistion
     // after deprecating the llama_eval calls, these will be removed
-    std::vector<llama_pos>
-
+    std::vector<llama_pos> pos;
+
+    std::vector<int32_t> n_seq_id;
+    std::vector<llama_seq_id *> seq_id_arr;
+    std::vector<std::vector<llama_seq_id>> seq_id;

     if (batch.pos == nullptr) {
         pos.resize(n_tokens);
@@ -5774,12 +5899,18 @@ static int llama_decode_internal(
     }

     if (batch.seq_id == nullptr) {
+        n_seq_id.resize(n_tokens);
         seq_id.resize(n_tokens);
+        seq_id_arr.resize(n_tokens);
         for (uint32_t i = 0; i < n_tokens; i++) {
-
+            n_seq_id[i] = 1;
+            seq_id[i].resize(1);
+            seq_id[i][0] = batch.all_seq_id;
+            seq_id_arr[i] = seq_id[i].data();
         }

-        batch.
+        batch.n_seq_id = n_seq_id.data();
+        batch.seq_id = seq_id_arr.data();
     }

     if (!llama_kv_cache_find_slot(kv_self, batch)) {
@@ -5800,6 +5931,13 @@ static int llama_decode_internal(

     ggml_allocr_alloc_graph(lctx.alloc, gf);

+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
+    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
+    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
+
+
 #ifdef GGML_USE_CUBLAS
     for (int i = 0; i < gf->n_leafs; i++) {
         ggml_tensor * node = gf->leafs[i];
@@ -5817,6 +5955,12 @@ static int llama_decode_internal(
     }

     ggml_cuda_set_mul_mat_q(cparams.mul_mat_q);
+
+    // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
+    if (!lctx.embedding.empty()) {
+        embeddings->backend = GGML_BACKEND_CPU;
+    }
+    res->backend = GGML_BACKEND_CPU;
 #endif

     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
@@ -5841,12 +5985,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }

-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-
-    GGML_ASSERT(strcmp(res->name, "result_output") == 0);
-    GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
-
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -5981,11 +6119,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
+    static const char * hex = "0123456789ABCDEF";
     switch (llama_vocab_get_type(vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
-            char buf[7];
-            int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-            GGML_ASSERT(0 <= result && result < 7);
+            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
             return vocab.token_to_id.at(buf);
         }
         case LLAMA_VOCAB_TYPE_BPE: {
@@ -6199,7 +6336,6 @@ struct llm_tokenizer_bpe {
             llm_symbol sym;
             size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
             sym.text = word.c_str() + offset;
-            sym.n = 1;
             sym.n = char_len;
             offset += sym.n;
             sym.prev = index - 1;
@@ -6459,7 +6595,137 @@ private:
     llm_bigram_bpe::queue work_queue;
 };

-
+typedef enum FRAGMENT_BUFFER_VARIANT_TYPE{
+    FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN,
+    FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT
+} FRAGMENT_BUFFER_VARIANT_TYPE;
+
+struct fragment_buffer_variant{
+    fragment_buffer_variant(llama_vocab::id _token)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN),
+        token(_token),
+        raw_text(_dummy),
+        offset(0),
+        length(0){}
+    fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
+    :
+        type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
+        token((llama_vocab::id)-1),
+        raw_text(_raw_text),
+        offset(_offset),
+        length(_length){
+            GGML_ASSERT( _offset >= 0 );
+            GGML_ASSERT( _length >= 1 );
+            GGML_ASSERT( offset + length <= raw_text.length() );
+        }
+
+    const FRAGMENT_BUFFER_VARIANT_TYPE type;
+    const llama_vocab::id token;
+    const std::string _dummy;
+    const std::string & raw_text;
+    const uint64_t offset;
+    const uint64_t length;
+};
+
+// #define PRETOKENIZERDEBUG
+
+static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer)
+{
+    // for each special token
+    for (const auto & st: vocab.special_tokens_cache) {
+        const auto & special_token = st.first;
+        const auto & special_id = st.second;
+
+        // for each text fragment
+        std::forward_list<fragment_buffer_variant>::iterator it = buffer.begin();
+        while (it != buffer.end()) {
+            auto & fragment = (*it);
+
+            // if a fragment is text ( not yet processed )
+            if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
+                auto * raw_text = &(fragment.raw_text);
+
+                auto raw_text_base_offset = fragment.offset;
+                auto raw_text_base_length = fragment.length;
+
+                // loop over the text
+                while (true) {
+                    // find the first occurence of a given special token in this fragment
+                    //  passing offset argument only limit the "search area" but match coordinates
+                    //  are still relative to the source full raw_text
+                    auto match = raw_text->find(special_token, raw_text_base_offset);
+
+                    // no occurences found, stop processing this fragment for a given special token
+                    if (match == std::string::npos) break;
+
+                    // check if match is within bounds of offset <-> length
+                    if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
+
+#ifdef PRETOKENIZERDEBUG
+                    fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    auto source = std::distance(buffer.begin(), it);
+
+                    // if match is further than base offset
+                    //  then we have some text to the left of it
+                    if (match > raw_text_base_offset) {
+                        // left
+                        const int64_t left_reminder_offset = raw_text_base_offset + 0;
+                        const int64_t left_reminder_length = match - raw_text_base_offset;
+                        buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
+#endif
+                        it++;
+                    }
+
+                    // special token
+                    buffer.emplace_after(it, special_id);
+                    it++;
+
+                    // right
+                    if (match + special_token.length() < raw_text_base_offset + raw_text_base_length) {
+                        const int64_t right_reminder_offset = match + special_token.length();
+                        const int64_t right_reminder_length = raw_text_base_length - ((match - raw_text_base_offset) + special_token.length());
+                        buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
+#endif
+
+                        it++;
+
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+
+                        // repeat for the right side
+                        raw_text_base_offset = right_reminder_offset;
+                        raw_text_base_length = right_reminder_length;
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
+#endif
+                    } else {
+                        if (source == 0) {
+                            buffer.erase_after(buffer.before_begin());
+                        } else {
+                            buffer.erase_after(std::next(buffer.begin(), (source-1)));
+                        }
+                        break;
+                    }
+                }
+            }
+            it++;
+        }
+    }
+}
+
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) {
     std::vector<llama_vocab::id> output;

     // OG tokenizer behavior:
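How the partitioner is driven is shown by llama_tokenize_internal() in the next hunk; a condensed sketch of the same flow (text and vocab are assumed inputs):

    // Sketch: the whole input starts as one RAW_TEXT fragment; every match of a
    // cached special token is spliced out as a TOKEN fragment, leaving the
    // surrounding text as smaller RAW_TEXT fragments.
    std::forward_list<fragment_buffer_variant> buffer;
    buffer.emplace_front(text, 0, text.length());
    tokenizer_st_partition(vocab, buffer);
    for (const auto & f : buffer) {
        if (f.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN) {
            // emit f.token directly
        } else {
            // tokenize f.raw_text.substr(f.offset, f.length) as ordinary text
        }
    }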
@@ -6475,20 +6741,58 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
         return output;
     }

+    std::forward_list<fragment_buffer_variant> fragment_buffer;
+    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+
+    if (special) tokenizer_st_partition( vocab, fragment_buffer );
+
     switch (vocab.type) {
         case LLAMA_VOCAB_TYPE_SPM:
             {
-
-
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        // without adding this leading whitespace, we do not get the same results as the original tokenizer
+
+                        // TODO: It's likely possible to get rid of this string copy entirely
+                        //  by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
+                        //  and passing 'add space prefix' as bool argument
+                        //
+                        auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);

-
-
-
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_spm tokenizer(vocab);
+                        llama_escape_whitespace(raw_text);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-
-
+                for (const auto & fragment: fragment_buffer)
+                {
+                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT)
+                    {
+                        auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+
+#ifdef PRETOKENIZERDEBUG
+                        fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
+#endif
+                        llm_tokenizer_bpe tokenizer(vocab);
+                        tokenizer.tokenize(raw_text, output);
+                    }
+                    else // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
+                    {
+                        output.push_back(fragment.token);
+                    }
+                }
             } break;
     }

@@ -6761,7 +7065,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     std::vector<llama_grammar_candidate> rejects;

     if (stack.empty()) {
-        for (auto tok : candidates) {
+        for (const auto & tok : candidates) {
             if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
                 rejects.push_back(tok);
             }
@@ -6772,7 +7076,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
     const llama_grammar_element * stack_pos = stack.back();

     std::vector<llama_grammar_candidate> next_candidates;
-    for (auto tok : candidates) {
+    for (const auto & tok : candidates) {
         if (*tok.code_points == 0) {
             // reached end of full codepoints in token, reject iff it ended in a partial sequence
             // that cannot satisfy this position in grammar
@@ -6798,7 +7102,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         llama_grammar_advance_stack(rules, stack_after, next_stacks);

         auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
-        for (auto tok : next_rejects) {
+        for (const auto & tok : next_rejects) {
             rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
         }

@@ -7125,37 +7429,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
     llama_sample_temp(ctx, candidates_p, temp);
 }

-void
-
-
-
-
-
-
-
-
-        if (token_iter == last_tokens + last_tokens_size) {
-            continue;
-        }
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty;
-        } else {
-            candidates->data[i].logit /= penalty;
-        }
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
+void llama_sample_repetition_penalties(
+        struct llama_context * ctx,
+        llama_token_data_array * candidates,
+        const llama_token * last_tokens,
+        size_t penalty_last_n,
+        float penalty_repeat,
+        float penalty_freq,
+        float penalty_present) {
+    if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
         return;
     }

@@ -7163,19 +7445,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l

     // Create a frequency map to count occurrences of each token in last_tokens
     std::unordered_map<llama_token, int> token_count;
-    for (size_t i = 0; i <
-        token_count[
+    for (size_t i = 0; i < penalty_last_n; ++i) {
+        token_count[last_tokens[i]]++;
     }

     // Apply frequency and presence penalties to the candidates
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
+        const auto token_iter = token_count.find(candidates->data[i].id);
         if (token_iter == token_count.end()) {
             continue;
         }

-        int count = token_iter->second;
-
+        const int count = token_iter->second;
+
+        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
+        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
+        if (candidates->data[i].logit <= 0) {
+            candidates->data[i].logit *= penalty_repeat;
+        } else {
+            candidates->data[i].logit /= penalty_repeat;
+        }
+
+        candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
     }

     candidates->sorted = false;
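The two older samplers (llama_sample_repetition_penalty and llama_sample_frequency_and_presence_penalties) are folded into this single call. A hedged usage sketch, assuming ctx, a llama_token_data_array candidates, and a last_tokens history with at least penalty_last_n entries (newest last):

    const size_t penalty_last_n = 64;
    llama_sample_repetition_penalties(ctx, &candidates,
            last_tokens.data() + last_tokens.size() - penalty_last_n,
            penalty_last_n,
            /*penalty_repeat  =*/ 1.10f,
            /*penalty_freq    =*/ 0.00f,
            /*penalty_present =*/ 0.00f);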
@@ -7197,14 +7488,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const llama_token eos = llama_token_eos(ctx);
+    const llama_token eos = llama_token_eos(&ctx->model);

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     std::vector<llama_grammar_candidate> candidates_grammar;

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece =
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7407,7 +7698,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
 void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
     const int64_t t_start_sample_us = ggml_time_us();

-    if (token == llama_token_eos(ctx)) {
+    if (token == llama_token_eos(&ctx->model)) {
         for (const auto & stack : grammar->stacks) {
             if (stack.empty()) {
                 return;
@@ -7416,7 +7707,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece =
+    const std::string piece = llama_token_to_piece(ctx, token);

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
@@ -8616,7 +8907,7 @@ struct llama_context * llama_new_context_with_model(
             // build worst-case graph
             int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
             int n_past = cparams.n_ctx - n_tokens;
-            llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
+            llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

 #ifdef GGML_USE_METAL
@@ -8831,6 +9122,9 @@ void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llam
 }

 void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst) {
+        return;
+    }
     llama_kv_cache_seq_cp(ctx->kv_self, seq_id_src, seq_id_dst, p0, p1);
 }

@@ -9283,7 +9577,7 @@ int llama_eval_embd(
         int n_past) {
     llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);

-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, n_past, 1, 0, };
+    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };

     const int ret = llama_decode_internal(*ctx, batch);
     if (ret < 0) {
@@ -9304,20 +9598,21 @@ struct llama_batch llama_batch_get_one(
         llama_pos pos_0,
         llama_seq_id seq_id) {
     return {
-        /*n_tokens
-        /*tokens
-        /*embd
-        /*pos
-        /*
-        /*
-        /*
-        /*
-        /*
+        /*n_tokens   =*/ n_tokens,
+        /*tokens     =*/ tokens,
+        /*embd       =*/ nullptr,
+        /*pos        =*/ nullptr,
+        /*n_seq_id   =*/ nullptr,
+        /*seq_id     =*/ nullptr,
+        /*logits     =*/ nullptr,
+        /*all_pos_0  =*/ pos_0,
+        /*all_pos_1  =*/ 1,
+        /*all_seq_id =*/ seq_id,
     };
 }

-struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
-    llama_batch batch = {
+struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
+    llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, };

     if (embd) {
         batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -9325,19 +9620,29 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd) {
         batch.token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
     }

-    batch.pos
-    batch.
-    batch.
+    batch.pos = (llama_pos *) malloc(sizeof(llama_pos) * n_tokens);
+    batch.n_seq_id = (int32_t *) malloc(sizeof(int32_t) * n_tokens);
+    batch.seq_id = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
+    for (int i = 0; i < n_tokens; ++i) {
+        batch.seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
+    }
+    batch.logits = (int8_t *) malloc(sizeof(int8_t) * n_tokens);

     return batch;
 }

 void llama_batch_free(struct llama_batch batch) {
-    if (batch.token)
-    if (batch.embd)
-    if (batch.pos)
-    if (batch.
-    if (batch.
+    if (batch.token) free(batch.token);
+    if (batch.embd) free(batch.embd);
+    if (batch.pos) free(batch.pos);
+    if (batch.n_seq_id) free(batch.n_seq_id);
+    if (batch.seq_id) {
+        for (int i = 0; i < batch.n_tokens; ++i) {
+            free(batch.seq_id[i]);
+        }
+        free(batch.seq_id);
+    }
+    if (batch.logits) free(batch.logits);
 }

 int llama_decode(
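The simplest decode path is unchanged for callers: llama_batch_get_one() leaves pos/n_seq_id/seq_id as nullptr and llama_decode_internal() (earlier in this diff) synthesizes them from all_pos_0/all_pos_1/all_seq_id. A hedged sketch, assuming ctx, tokens and n_tokens:

    llama_batch batch = llama_batch_get_one(tokens, n_tokens, /*pos_0=*/0, /*seq_id=*/0);
    if (llama_decode(ctx, batch) != 0) {
        // non-zero return means the batch was not (fully) processed
        fprintf(stderr, "llama_decode failed\n");
    }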
@@ -9363,45 +9668,45 @@ float * llama_get_embeddings(struct llama_context * ctx) {
     return ctx->embedding.data();
 }

-const char * llama_token_get_text(const struct
-    return
+const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].text.c_str();
 }

-float llama_token_get_score(const struct
-    return
+float llama_token_get_score(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].score;
 }

-llama_token_type llama_token_get_type(const struct
-    return
+llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
+    return model->vocab.id_to_token[token].type;
 }

-llama_token llama_token_bos(const struct
-    return
+llama_token llama_token_bos(const struct llama_model * model) {
+    return model->vocab.special_bos_id;
 }

-llama_token llama_token_eos(const struct
-    return
+llama_token llama_token_eos(const struct llama_model * model) {
+    return model->vocab.special_eos_id;
 }

-llama_token llama_token_nl(const struct
-    return
-}
-llama_token llama_token_prefix(const struct llama_context * ctx) {
-    return ctx->model.vocab.special_prefix_id;
+llama_token llama_token_nl(const struct llama_model * model) {
+    return model->vocab.linefeed_id;
 }

-llama_token
-    return
+llama_token llama_token_prefix(const struct llama_model * model) {
+    return model->vocab.special_prefix_id;
 }

-llama_token
-    return
+llama_token llama_token_middle(const struct llama_model * model) {
+    return model->vocab.special_middle_id;
 }

-llama_token
-    return
+llama_token llama_token_suffix(const struct llama_model * model) {
+    return model->vocab.special_suffix_id;
 }

+llama_token llama_token_eot(const struct llama_model * model) {
+    return model->vocab.special_eot_id;
+}

 int llama_tokenize(
         const struct llama_model * model,
|
|
9409
9714
|
int text_len,
|
9410
9715
|
llama_token * tokens,
|
9411
9716
|
int n_max_tokens,
|
9412
|
-
bool add_bos
|
9413
|
-
|
9717
|
+
bool add_bos,
|
9718
|
+
bool special) {
|
9719
|
+
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special);
|
9414
9720
|
|
9415
9721
|
if (n_max_tokens < (int) res.size()) {
|
9416
9722
|
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|