llama_cpp 0.6.0 → 0.7.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"
 
+#include "unicode.h"
+
 #include "ggml.h"
 
 #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
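The helper above replaces bitwise equality for the float hyperparameters checked in llama_hparams::operator!= further down. A minimal standalone sketch of why the tolerance matters (the nextafterf value is an illustrative stand-in for an eps that picked up rounding noise on the way through serialization):

    #include <cmath>
    #include <cstdio>

    int main() {
        // Two eps values that differ only in the last bit: a bitwise check
        // (such as comparing the raw struct bytes) reports a mismatch, while
        // the absolute-tolerance check used above treats them as equal.
        const float a = 1e-5f;
        const float b = std::nextafterf(a, 1.0f);
        printf("bitwise equal:    %d\n", a == b);                    // 0
        printf("close (tol 1e-9): %d\n", std::fabs(b - a) <= 1e-9f); // 1
        return 0;
    }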
@@ -163,6 +186,7 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_REFACT,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -175,6 +199,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT,       "mpt"       },
     { LLM_ARCH_BAICHUAN,  "baichuan"  },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT,    "refact"    },
 };
 
 enum llm_kv {
@@ -395,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -943,7 +985,24 @@ struct llama_hparams {
     float rope_freq_scale_train;
 
     bool operator!=(const llama_hparams & other) const {
-        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
+        if (this->vocab_only  != other.vocab_only)  return true;
+        if (this->n_vocab     != other.n_vocab)     return true;
+        if (this->n_ctx_train != other.n_ctx_train) return true;
+        if (this->n_embd      != other.n_embd)      return true;
+        if (this->n_head      != other.n_head)      return true;
+        if (this->n_head_kv   != other.n_head_kv)   return true;
+        if (this->n_layer     != other.n_layer)     return true;
+        if (this->n_rot       != other.n_rot)       return true;
+        if (this->n_ff        != other.n_ff)        return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,            other.f_norm_eps,            EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,        other.f_norm_rms_eps,        EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train,  other.rope_freq_base_train,  EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }
 
     uint32_t n_gqa() const {
@@ -1071,6 +1130,10 @@ struct llama_vocab {
     id special_pad_id = -1;
 
     id linefeed_id       = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left,  " ",  "\u0120");
@@ -1272,8 +1335,8 @@ static bool llama_kv_cache_init(
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
 static bool llama_kv_cache_find_slot(
-           struct llama_kv_cache & cache,
-               const llama_batch & batch) {
+        struct llama_kv_cache & cache,
+        const struct llama_batch & batch) {
     const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
@@ -1341,10 +1404,13 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
 }
 
 static void llama_kv_cache_seq_rm(
-           struct llama_kv_cache & cache,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1) {
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.erase(seq_id);
@@ -1356,11 +1422,14 @@ static void llama_kv_cache_seq_rm(
 }
 
 static void llama_kv_cache_seq_cp(
-           struct llama_kv_cache & cache,
-                    llama_seq_id   seq_id_src,
-                    llama_seq_id   seq_id_dst,
-                       llama_pos   p0,
-                       llama_pos   p1) {
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id_src,
+        llama_seq_id seq_id_dst,
+        llama_pos p0,
+        llama_pos p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1378,11 +1447,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 }
 
 static void llama_kv_cache_seq_shift(
-           struct llama_kv_cache & cache,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                       llama_pos   delta) {
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        llama_pos delta) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
    for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].pos += delta;
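With the clamping in place, negative positions act as open range ends: p0 < 0 becomes 0 and p1 < 0 becomes the maximum position. A minimal sketch against the public wrappers declared in llama.h (ctx is assumed to be a valid context with an active KV cache):

    #include "llama.h"

    static void kv_cache_examples(struct llama_context * ctx) {
        // remove everything sequence 1 ever wrote: (-1, -1) now means "all positions"
        llama_kv_cache_seq_rm(ctx, /*seq_id*/ 1, /*p0*/ -1, /*p1*/ -1);

        // share the first 32 cached positions of sequence 0 with sequence 2
        llama_kv_cache_seq_cp(ctx, /*src*/ 0, /*dst*/ 2, /*p0*/ 0, /*p1*/ 32);

        // slide sequence 0 left by 16 positions, from position 16 to the end
        llama_kv_cache_seq_shift(ctx, /*seq_id*/ 0, /*p0*/ 16, /*p1*/ -1, /*delta*/ -16);
    }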
@@ -1907,6 +1979,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_REFACT:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_1B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -1971,6 +2051,7 @@ static void llm_load_vocab(
 
         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+            GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
             std::string first;
             std::string second;
@@ -2005,6 +2086,7 @@ static void llm_load_vocab(
 
     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
 
@@ -2013,12 +2095,13 @@ static void llm_load_vocab(
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
 
     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }
 
     // special tokens
@@ -2141,6 +2224,7 @@ static void llm_load_tensors(
     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_REFACT:
             {
                 model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
@@ -3334,6 +3418,353 @@ static struct ggml_cgraph * llm_build_baichaun(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_refact(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx            : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    // printf("n_kv = %d\n", n_kv);
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [n_tokens, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_batch & batch) {
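The notable difference from llm_build_llama is positional encoding: Refact applies no RoPE and instead biases the attention scores with ALiBi via ggml_alibi(ctx0, KQ_scaled, 0, n_head, 8). The sketch below shows the standard ALiBi slope schedule that call implies for max_bias = 8 (an illustrative recipe with a hypothetical head count, not a verbatim copy of ggml's kernel):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 32;     // hypothetical head count
        const float max_bias = 8.0f;   // matches the ggml_alibi argument above

        // slopes halve geometrically; non-power-of-two head counts get an
        // interleaved second series, as in the original ALiBi paper
        const int   n_head_log2 = 1 << (int) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -max_bias          / n_head_log2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);

        for (int k = 0; k < n_head; ++k) {
            const float slope = k < n_head_log2
                ? std::pow(m0, (float) (k + 1))
                : std::pow(m1, (float) (2*(k - n_head_log2) + 1));
            // head k adds slope * (key_pos - query_pos) to each attention score
            printf("head %2d: slope = %g\n", k, slope);
        }
        return 0;
    }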
@@ -3974,6 +4405,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm_build_refact(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -4107,7 +4542,8 @@ static int llama_decode_internal(
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON;
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4227,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-    auto buf = token_data.text.substr(3, 2);
-    return strtol(buf.c_str(), NULL, 16);
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        auto buf = token_data.text.substr(3, 2);
+        return strtol(buf.c_str(), NULL, 16);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        GGML_ASSERT(false);
+        return unicode_to_bytes_bpe(token_data.text);
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }
 
 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-    char buf[7];
-    int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
-    GGML_ASSERT(0 <= result && result < 7);
-    return vocab.token_to_id.at(buf);
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
+        return vocab.token_to_id.at(buf);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }
 
 static void llama_escape_whitespace(std::string & text) {
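The two vocabulary types encode raw bytes differently: SPM vocabs store each byte as a literal token of the form <0xNN>, while BPE vocabs remap bytes to printable code points via bytes_to_unicode_bpe from the new unicode.h. A standalone round trip for the SPM convention used above:

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
        const unsigned char ch = '\n';

        // llama_byte_to_token formats the byte as a vocab key: "<0x0A>"
        char buf[7];
        snprintf(buf, sizeof(buf), "<0x%02X>", ch);

        // llama_token_to_byte reverses it: parse the two hex digits at offset 3
        const std::string text(buf);
        const unsigned char back = (unsigned char) strtol(text.substr(3, 2).c_str(), NULL, 16);

        printf("%s -> 0x%02X\n", buf, back);   // prints: <0x0A> -> 0x0A
        return 0;
    }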
@@ -4518,15 +4977,9 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        try {
-                            llama_token token_byte = llama_byte_to_token(vocab, *j);
-                            output.push_back(token_byte);
-                        } catch (const std::out_of_range & err) {
-                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
-                        }
-                    } else {
-                        output.push_back((*token_multibyte).second);
+                        throw std::runtime_error("ERROR: byte not found in vocab");
                     }
+                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -4563,23 +5016,144 @@ private:
         work_queue.push(bigram);
     }
 
-
-
-        std::vector<std::string> words;
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::vector<std::string> bpe_encoded_words;
+
+        std::string token = "";
+        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;
+
+        std::vector<std::string> text_utf;
+        text_utf.reserve(text.size());
+        bpe_words.reserve(text.size());
+        bpe_encoded_words.reserve(text.size());
+
+        auto cps = codepoints_from_utf8(text);
+        for (size_t i = 0; i < cps.size(); ++i)
+            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
+            int bytes_remain = text_utf.size() - i;
+            // forward backward lookups
+            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_words.emplace_back(token);
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" || utf_char_next_next == "e") ||
+                    (utf_char_next == "v" || utf_char_next_next == "e") ||
+                    (utf_char_next == "l" || utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next + utf_char_next_next;
+                    bpe_words.emplace_back(token); // the contraction
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
 
-
-
-
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }
 
-
-
-
-
-
-
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_words.emplace_back(token);
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }
+
+        for (std::string & word : bpe_words) {
+            std::string encoded_token = "";
+            for (char & c : word) {
+                encoded_token += bytes_to_unicode_bpe(c);
+            }
+            bpe_encoded_words.emplace_back(encoded_token);
         }
-        return words;
 
+        return bpe_encoded_words;
     }
 
     const llama_vocab & vocab;
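The final loop runs every pre-tokenized word through bytes_to_unicode_bpe, the GPT-2 byte-to-unicode trick. unicode.h itself is not shown in this diff; the sketch below is the well-known mapping it is expected to implement, where printable bytes map to themselves and the rest are assigned code points from U+0100 upward, so every byte becomes a printable character before BPE merging:

    #include <cstdint>
    #include <map>

    static std::map<uint8_t, int> gpt2_bytes_to_unicode() {
        std::map<uint8_t, int> m;
        int n = 0;
        for (int b = 0; b < 256; ++b) {
            const bool printable = (b >= '!'  && b <= '~')  ||
                                   (b >= 0xA1 && b <= 0xAC) ||
                                   (b >= 0xAE && b <= 0xFF);
            // e.g. ' ' (0x20) is non-printable here and maps to U+0120 "Ġ",
            // which is why find_bpe_rank replaces " " with "\u0120" above, and
            // '\n' (0x0A) maps to U+010A, matching the linefeed_id lookup
            m[(uint8_t) b] = printable ? b : 256 + n++;
        }
        return m;
    }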
@@ -6022,7 +6596,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6100,10 +6685,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +7325,14 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
+        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
             llama_free(ctx);
             return NULL;
         }
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
        //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
        //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
     }
@@ -6872,6 +7460,10 @@ int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +7631,6 @@ struct llama_data_file_context : llama_data_context {
  *
 */
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
-    // TODO: does not support multi-sequence states
-    {
-        const auto & kv_self = ctx->kv_self;
-        for (uint32_t i = 0; i < kv_self.head; ++i) {
-            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
-            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
-            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
-        }
-    }
-
     // copy rng
     {
         std::stringstream rng_ss;
@@ -7101,36 +7683,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const auto & hparams = ctx->model.hparams;
         const auto & cparams = ctx->cparams;
 
-        const int    n_layer = hparams.n_layer;
-        const int    n_embd  = hparams.n_embd_gqa();
-        const int    n_ctx   = cparams.n_ctx;
+        const auto   n_layer = hparams.n_layer;
+        const auto   n_embd  = hparams.n_embd_gqa();
+        const auto   n_ctx   = cparams.n_ctx;
 
-        const size_t   kv_size = kv_self.buf.size;
-        const int      kv_ntok = kv_self.head;
+        const size_t   kv_buf_size = kv_self.buf.size;
+        const uint32_t kv_head     = kv_self.head;
+        const uint32_t kv_size     = kv_self.size;
 
-        data_ctx->write(&kv_size, sizeof(kv_size));
-        data_ctx->write(&kv_ntok, sizeof(kv_ntok));
+        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
+        data_ctx->write(&kv_head,     sizeof(kv_head));
+        data_ctx->write(&kv_size,     sizeof(kv_size));
 
-        if (kv_size) {
+        if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
 
             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
             kout3d->data = kout3d_data.data();
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
             vout3d->data = vout3d_data.data();
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             data_ctx->write(kout3d_data.data(), kout3d_data.size());
             data_ctx->write(vout3d_data.data(), vout3d_data.size());
         }
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            const auto & cell = kv_self.cells[i];
+
+            const llama_pos pos         = cell.pos;
+            const size_t    seq_id_size = cell.seq_id.size();
+
+            data_ctx->write(&pos,         sizeof(pos));
+            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+
+            for (auto seq_id : cell.seq_id) {
+                data_ctx->write(&seq_id, sizeof(seq_id));
+            }
+        }
     }
 }
 
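After the K/V tensor payload, the state now carries one record per cache cell, so multi-sequence layouts survive a save/load (the old code instead asserted a single contiguous sequence 0, as removed above). A sketch of the record layout, with illustrative field names for the raw writes shown in the diff:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // written once per cell, for all kv_size cells, in cell order
    struct kv_cell_record {
        int32_t              pos;        // llama_pos of the cell (-1 when empty)
        size_t               n_seq_ids;  // number of sequences referencing the cell
        std::vector<int32_t> seq_ids;    // n_seq_ids llama_seq_id values follow
    };

llama_set_state_data reads the same records back and rebuilds ctx->kv_self.cells, as the next hunk shows.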
@@ -7215,34 +7813,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         const int n_embd  = hparams.n_embd_gqa();
         const int n_ctx   = cparams.n_ctx;
 
-        size_t kv_size;
-        int    kv_ntok;
+        size_t   kv_buf_size;
+        uint32_t kv_head;
+        uint32_t kv_size;
 
-        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
-        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
+        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
+        memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
+        memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);
 
-        if (kv_size) {
-            GGML_ASSERT(kv_self.buf.size == kv_size);
+        if (kv_buf_size) {
+            GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
             const size_t elt_size = ggml_element_size(kv_self.k);
 
             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             kin3d->data = (void *) inp;
             inp += ggml_nbytes(kin3d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             vin3d->data = (void *) inp;
             inp += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_ntok, n_layer,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
 
             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_ntok, n_embd, n_layer,
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_free(cpy_ctx);
         }
 
-        ctx->kv_self.head = kv_ntok;
+        ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
+
+        ctx->kv_self.cells.resize(kv_size);
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            llama_pos pos;
+            size_t    seq_id_size;
+
+            memcpy(&pos,         inp, sizeof(pos));         inp += sizeof(pos);
+            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+
+            ctx->kv_self.cells[i].pos = pos;
+
+            llama_seq_id seq_id;
+
+            for (size_t j = 0; j < seq_id_size; ++j) {
+                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
+                ctx->kv_self.cells[i].seq_id.insert(seq_id);
+            }
+        }
     }
 
     const size_t nread = inp - src;
@@ -7471,6 +8090,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
 llama_token llama_token_nl(const struct llama_context * ctx) {
     return ctx->model.vocab.linefeed_id;
 }
+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
+}
+
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
+}
+
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
+}
+
 
 int llama_tokenize(
     const struct llama_model * model,
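The four defaults (32007-32010) line up with the fill-in-the-middle special tokens of CodeLlama-style vocabularies. A sketch of assembling an infill prompt with the new accessors, following the <PRE> prefix <SUF> suffix <MID> convention and stopping generation at <EOT> (ctx is assumed to be a valid context; prefix/suffix are pre-tokenized):

    #include <vector>
    #include "llama.h"

    static std::vector<llama_token> build_infill_prompt(
            struct llama_context * ctx,
            const std::vector<llama_token> & prefix,
            const std::vector<llama_token> & suffix) {
        std::vector<llama_token> prompt;
        prompt.push_back(llama_token_prefix(ctx));
        prompt.insert(prompt.end(), prefix.begin(), prefix.end());
        prompt.push_back(llama_token_suffix(ctx));
        prompt.insert(prompt.end(), suffix.begin(), suffix.end());
        prompt.push_back(llama_token_middle(ctx));
        // decode the prompt, then sample until llama_token_eot(ctx) appears
        return prompt;
    }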
@@ -7493,35 +8128,66 @@ int llama_tokenize(
     return res.size();
 }
 
+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+    auto unicode_sequences = codepoints_from_utf8(text);
+    for (auto& unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+    }
+
+    return decoded_text;
+}
+
 // does not write null-terminator to buf
 int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_n_vocab(model)) {
-        if (llama_is_normal_token(model->vocab, token)) {
-            std::string result = model->vocab.id_to_token[token].text;
-            if (llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM) {
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
+                if (length < 1) {
+                    return -1;
+                }
+                buf[0] = llama_token_to_byte(model->vocab, token);
+                return 1;
+            } else {
+                GGML_ASSERT(false);
             }
-            if (length < (int) result.length()) {
-                return -result.length();
-            }
-            memcpy(buf, result.c_str(), result.length());
-            return result.length();
-        } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
-            if (length < 3) {
-                return -3;
-            }
-            buf[0] = '\xe2';
-            buf[1] = '\x96';
-            buf[2] = '\x85';
-            return 3;
-        } else if (llama_is_control_token(model->vocab, token)) {
-            ;
-        } else if (llama_is_byte_token(model->vocab, token)) {
-            if (length < 1) {
-                return -1;
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else {
+                GGML_ASSERT(false);
             }
-            buf[0] = llama_token_to_byte(model->vocab, token);
-            return 1;
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
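llama_token_to_piece keeps its convention of returning the negative required length when the buffer is too small, and the new BPE path follows it as well. The usual two-call pattern (model is assumed to be a loaded llama_model):

    #include <string>
    #include <vector>
    #include "llama.h"

    static std::string token_to_piece(const struct llama_model * model, llama_token token) {
        std::vector<char> buf(8);
        int n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        if (n < 0) {
            buf.resize((size_t) -n);   // -n is the exact length required
            n = llama_token_to_piece(model, token, buf.data(), (int) buf.size());
        }
        return std::string(buf.data(), (size_t) n);   // n is 0 for control tokens
    }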
@@ -7548,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);
 
     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %8.2f ms\n", __func__, timings.t_load_ms);
-    LLAMA_LOG_INFO("%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {