llama_cpp 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -3
- data/ext/llama_cpp/src/ggml-cuda.cu +122 -72
- data/ext/llama_cpp/src/ggml-metal.m +4 -5
- data/ext/llama_cpp/src/ggml-metal.metal +9 -2
- data/ext/llama_cpp/src/ggml-opencl.cpp +119 -53
- data/ext/llama_cpp/src/ggml.c +755 -320
- data/ext/llama_cpp/src/ggml.h +13 -0
- data/ext/llama_cpp/src/k_quants.c +744 -2
- data/ext/llama_cpp/src/llama.cpp +779 -113
- data/ext/llama_cpp/src/llama.h +22 -6
- data/ext/llama_cpp/src/unicode.h +462 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -0
- metadata +3 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,6 +1,8 @@
 #define LLAMA_API_INTERNAL
 #include "llama.h"

+#include "unicode.h"
+
 #include "ggml.h"

 #include "ggml-alloc.h"
@@ -123,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std::
     }
     s = std::move(result);
 }
+
+static bool is_float_close(float a, float b, float abs_tol) {
+    // Check for non-negative tolerance
+    if (abs_tol < 0.0) {
+        throw std::invalid_argument("Tolerance must be non-negative");
+    }
+
+    // Exact equality check
+    if (a == b) {
+        return true;
+    }
+
+    // Check for infinities
+    if (std::isinf(a) || std::isinf(b)) {
+        return false;
+    }
+
+    // Regular comparison using the provided absolute tolerance
+    return std::fabs(b - a) <= abs_tol;
+}
+
 #ifdef GGML_USE_CPU_HBM
 #include <hbwmalloc.h>
 #endif
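A note on the new helper: `is_float_close` is an absolute-tolerance comparison, and the rewritten `llama_hparams::operator!=` further down uses it with `EPSILON = 1e-9` for the float hyperparameters. A minimal standalone sketch of the behaviour (the `main` harness here is illustrative, not part of the diff):

    #include <cmath>
    #include <cstdio>
    #include <stdexcept>

    // copied from the hunk above: absolute-tolerance float comparison
    static bool is_float_close(float a, float b, float abs_tol) {
        if (abs_tol < 0.0) {
            throw std::invalid_argument("Tolerance must be non-negative");
        }
        if (a == b) {
            return true;   // exact equality (also covers equal infinities)
        }
        if (std::isinf(a) || std::isinf(b)) {
            return false;  // a one-sided infinity is never within a finite tolerance
        }
        return std::fabs(b - a) <= abs_tol;
    }

    int main() {
        const float EPSILON = 1e-9;                                           // same tolerance as operator!=
        std::printf("%d\n", is_float_close(1e-5f, 1e-5f + 1e-10f, EPSILON));  // 1: within tolerance
        std::printf("%d\n", is_float_close(1.0f, 1.0f + 1e-3f, EPSILON));     // 0: differs by more than 1e-9
    }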
@@ -163,6 +186,7 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
+    LLM_ARCH_REFACT,
     LLM_ARCH_UNKNOWN,
 };

@@ -175,6 +199,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT,       "mpt"       },
     { LLM_ARCH_BAICHUAN,  "baichuan"  },
     { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT,    "refact"    },
 };

 enum llm_kv {
@@ -395,6 +420,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
         },
     },
+    {
+        LLM_ARCH_REFACT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,      "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,      "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,      "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -943,7 +985,24 @@ struct llama_hparams {
     float rope_freq_scale_train;

     bool operator!=(const llama_hparams & other) const {
-
+        if (this->vocab_only    != other.vocab_only)    return true;
+        if (this->n_vocab       != other.n_vocab)       return true;
+        if (this->n_ctx_train   != other.n_ctx_train)   return true;
+        if (this->n_embd        != other.n_embd)        return true;
+        if (this->n_head        != other.n_head)        return true;
+        if (this->n_head_kv     != other.n_head_kv)     return true;
+        if (this->n_layer       != other.n_layer)       return true;
+        if (this->n_rot         != other.n_rot)         return true;
+        if (this->n_ff          != other.n_ff)          return true;
+
+        const float EPSILON = 1e-9;
+
+        if (!is_float_close(this->f_norm_eps,           other.f_norm_eps,           EPSILON)) return true;
+        if (!is_float_close(this->f_norm_rms_eps,       other.f_norm_rms_eps,       EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
+        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
+
+        return false;
     }

     uint32_t n_gqa() const {
@@ -1071,6 +1130,10 @@ struct llama_vocab {
     id special_pad_id = -1;

     id linefeed_id = 13;
+    id special_prefix_id = 32007;
+    id special_middle_id = 32009;
+    id special_suffix_id = 32008;
+    id special_eot_id    = 32010;

     int find_bpe_rank(std::string token_left, std::string token_right) const {
         replace_all(token_left,  " ",  "\u0120");
@@ -1272,8 +1335,8 @@ static bool llama_kv_cache_init(
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
 static bool llama_kv_cache_find_slot(
-
-
+        struct llama_kv_cache & cache,
+       const struct llama_batch & batch) {
     const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;

@@ -1341,10 +1404,13 @@ static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0,
 }

 static void llama_kv_cache_seq_rm(
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.erase(seq_id);
@@ -1356,11 +1422,14 @@ static void llama_kv_cache_seq_rm(
 }

 static void llama_kv_cache_seq_cp(
-
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id_src,
+                 llama_seq_id   seq_id_dst,
+                    llama_pos   p0,
+                    llama_pos   p1) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id_src) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].seq_id.insert(seq_id_dst);
@@ -1378,11 +1447,14 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
 }

 static void llama_kv_cache_seq_shift(
-
-
-
-
-
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                    llama_pos   delta) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
     for (uint32_t i = 0; i < cache.size; ++i) {
         if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
             cache.cells[i].pos += delta;
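All three sequence helpers (`llama_kv_cache_seq_rm`, `_seq_cp`, `_seq_shift`) now normalize negative bounds, so a caller can pass -1 for `p0`/`p1` to mean "from the start" / "to the end" of the cache; the public counterparts in llama.h presumably follow the same convention. A small sketch of just that normalization in isolation (illustrative; the real functions also walk the cache cells):

    #include <cstdint>
    #include <limits>

    using llama_pos = int32_t; // matches the typedef in llama.h

    // normalization applied at the top of each kv-cache sequence helper
    static void normalize_range(llama_pos & p0, llama_pos & p1) {
        if (p0 < 0) p0 = 0;
        if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
    }

    int main() {
        llama_pos p0 = -1, p1 = -1;
        normalize_range(p0, p1); // p0 == 0, p1 == INT32_MAX: the whole sequence range
        return 0;
    }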
@@ -1907,6 +1979,14 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_REFACT:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_1B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -1971,6 +2051,7 @@ static void llm_load_vocab(

         for (int i = 0; i < n_merges; i++) {
             const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+            GGML_ASSERT(codepoints_from_utf8(word).size() > 0);

             std::string first;
             std::string second;
@@ -2005,6 +2086,7 @@ static void llm_load_vocab(

     for (uint32_t i = 0; i < n_vocab; i++) {
         std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        GGML_ASSERT(codepoints_from_utf8(word).size() > 0);

         vocab.token_to_id[word] = i;

@@ -2013,12 +2095,13 @@ static void llm_load_vocab(
         token_data.score = scores ? scores[i] : 0.0f;
         token_data.type  = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
     }
+    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());

     // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
     if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
         vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
     } else {
-        vocab.linefeed_id = llama_tokenize_internal(vocab, "\
+        vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
     }

     // special tokens
@@ -2141,6 +2224,7 @@ static void llm_load_tensors(
         const auto tn = LLM_TN(model.arch);
         switch (model.arch) {
             case LLM_ARCH_LLAMA:
+            case LLM_ARCH_REFACT:
                 {
                     model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

@@ -3334,6 +3418,353 @@ static struct ggml_cgraph * llm_build_baichaun(
     return gf;
 }

+static struct ggml_cgraph * llm_build_refact(
+         llama_context & lctx,
+     const llama_batch & batch) {
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+    const auto & cparams = lctx.cparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = cparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    const int32_t n_tokens = batch.n_tokens;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
+    const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
+
+    // printf("n_kv = %d\n", n_kv);
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    // KQ_scale
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
+    }
+
+    // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    offload_func_kq(KQ_mask);
+    ggml_set_name(KQ_mask, "KQ_mask");
+    ggml_allocr_alloc(lctx.alloc, KQ_mask);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        float * data = (float *) KQ_mask->data;
+        memset(data, 0, ggml_nbytes(KQ_mask));
+
+        for (int h = 0; h < 1; ++h) {
+            for (int j = 0; j < n_tokens; ++j) {
+                const llama_pos    pos    = batch.pos[j];
+                const llama_seq_id seq_id = batch.seq_id[j];
+
+                for (int i = 0; i < n_kv; ++i) {
+                    if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+                        data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+                    }
+                }
+            }
+        }
+    }
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [n_tokens, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_kv, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_kv, n_tokens, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8);
+            ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+
+            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_kv, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, n_tokens)
+            cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llm_build_falcon(
          llama_context & lctx,
      const llama_batch & batch) {
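One structural note on `llm_build_refact`: unlike the LLaMA graph it applies no RoPE; positional information comes from `ggml_alibi(ctx0, KQ_scaled, 0, n_head, 8)`, i.e. an ALiBi bias with `max_bias = 8` added to the scaled attention scores before the KQ mask and softmax. A rough, self-contained sketch of what that bias looks like per head (this assumes the standard ALiBi slope schedule for power-of-two head counts; it is an illustration, not the ggml kernel):

    #include <cmath>
    #include <cstdio>

    int main() {
        const int   n_head   = 32;
        const float max_bias = 8.0f;

        // assumed per-head slope: m_h = 2^(-max_bias * h / n_head), h = 1..n_head
        for (int h = 1; h <= 4; ++h) {
            const float slope = std::pow(2.0f, -max_bias * h / n_head);
            // the bias added to a score at distance d behind the current token is -slope * d
            std::printf("head %d: slope %.4f, bias at distance 16 = %.4f\n", h, slope, -slope * 16.0f);
        }
    }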
@@ -3974,6 +4405,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_starcoder(lctx, batch);
             } break;
+        case LLM_ARCH_REFACT:
+            {
+                result = llm_build_refact(lctx, batch);
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -4107,7 +4542,8 @@ static int llama_decode_internal(
     // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
     const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
         model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON
+        model.arch == LLM_ARCH_FALCON ||
+        model.arch == LLM_ARCH_REFACT;
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
     if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
         n_threads = 1;
@@ -4227,18 +4663,41 @@ static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }

-static
+static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
+    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
+}
+
+static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        auto buf = token_data.text.substr(3, 2);
+        return strtol(buf.c_str(), NULL, 16);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        GGML_ASSERT(false);
+        return unicode_to_bytes_bpe(token_data.text);
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
-
-
-
-
+    switch (llama_vocab_get_type(vocab)) {
+    case LLAMA_VOCAB_TYPE_SPM: {
+        char buf[7];
+        int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
+        GGML_ASSERT(0 <= result && result < 7);
+        return vocab.token_to_id.at(buf);
+    }
+    case LLAMA_VOCAB_TYPE_BPE: {
+        return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
+    }
+    default:
+        GGML_ASSERT(false);
+    }
 }

 static void llama_escape_whitespace(std::string & text) {
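For SentencePiece (SPM) vocabularies the byte-fallback tokens are stored literally as `<0xXX>`, which is why `llama_token_to_byte` parses `text.substr(3, 2)` as hex and `llama_byte_to_token` formats the byte back with `snprintf`. A tiny standalone sketch of that round trip, mirroring the formatting used above:

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>
    #include <string>

    int main() {
        const uint8_t ch = '\n';                         // byte 0x0A

        char buf[7];
        std::snprintf(buf, sizeof(buf), "<0x%02X>", ch); // "<0x0A>", the SPM byte-token text

        const std::string text = buf;
        const uint8_t back = (uint8_t) std::strtol(text.substr(3, 2).c_str(), nullptr, 16);

        std::printf("%s -> %u\n", text.c_str(), back);   // prints "<0x0A> -> 10"
    }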
@@ -4518,15 +4977,9 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-
-                        llama_token token_byte = llama_byte_to_token(vocab, *j);
-                        output.push_back(token_byte);
-                    } catch (const std::out_of_range & err) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
-                    }
-                } else {
-                    output.push_back((*token_multibyte).second);
+                        throw std::runtime_error("ERROR: byte not found in vocab");
                     }
+                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -4563,23 +5016,144 @@ private:
         work_queue.push(bigram);
     }

-
-
-    std::vector<std::string>
+    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
+        std::vector<std::string> bpe_words;
+        std::vector<std::string> bpe_encoded_words;
+
+        std::string token = "";
+        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;
+
+        std::vector<std::string> text_utf;
+        text_utf.reserve(text.size());
+        bpe_words.reserve(text.size());
+        bpe_encoded_words.reserve(text.size());
+
+        auto cps = codepoints_from_utf8(text);
+        for (size_t i = 0; i < cps.size(); ++i)
+            text_utf.emplace_back(codepoint_to_utf8(cps[i]));
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
+            int bytes_remain = text_utf.size() - i;
+            // forward backward lookups
+            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_words.emplace_back(token);
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" || utf_char_next_next == "e") ||
+                    (utf_char_next == "v" || utf_char_next_next == "e") ||
+                    (utf_char_next == "l" || utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_words.emplace_back(token); // push previous content as token
+                    }
+                    token = utf_char + utf_char_next + utf_char_next_next;
+                    bpe_words.emplace_back(token); // the contraction
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (codepoint_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (!token.size() && utf_char == " " && codepoint_type(utf_char_next) != CODEPOINT_TYPE_LETTER && codepoint_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && codepoint_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && codepoint_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && codepoint_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }

-
-
-
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }

-
-
-
-
-
-
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_words.emplace_back(token);
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }
+
+        for (std::string & word : bpe_words) {
+            std::string encoded_token = "";
+            for (char & c : word) {
+                encoded_token += bytes_to_unicode_bpe(c);
+            }
+            bpe_encoded_words.emplace_back(encoded_token);
         }
-        return words;

+        return bpe_encoded_words;
     }

     const llama_vocab & vocab;
@@ -6022,7 +6596,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }

-
+    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }

     llama_model model;
     llm_load_arch(ml, model);
@@ -6100,10 +6685,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

         const std::string name = ggml_get_name(tensor);

-        if (
-            read_data.
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);

         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
@@ -6738,13 +7325,14 @@ struct llama_context * llama_new_context_with_model(

 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
+        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
+
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
             llama_free(ctx);
             return NULL;
         }
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
         //ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
         //ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
     }
@@ -6872,6 +7460,10 @@ int llama_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

+float llama_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
@@ -7039,16 +7631,6 @@ struct llama_data_file_context : llama_data_context {
 *
 */
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
-    // TODO: does not support multi-sequence states
-    {
-        const auto & kv_self = ctx->kv_self;
-        for (uint32_t i = 0; i < kv_self.head; ++i) {
-            GGML_ASSERT(kv_self.cells[i].pos == (int32_t) i);
-            GGML_ASSERT(kv_self.cells[i].seq_id.size() == 1);
-            GGML_ASSERT(kv_self.cells[i].has_seq_id(0));
-        }
-    }
-
     // copy rng
     {
         std::stringstream rng_ss;
@@ -7101,36 +7683,38 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const auto & hparams = ctx->model.hparams;
         const auto & cparams = ctx->cparams;

-        const
-        const
-        const
+        const auto   n_layer = hparams.n_layer;
+        const auto   n_embd  = hparams.n_embd_gqa();
+        const auto   n_ctx   = cparams.n_ctx;

-        const size_t
-        const
+        const size_t   kv_buf_size = kv_self.buf.size;
+        const uint32_t kv_head     = kv_self.head;
+        const uint32_t kv_size     = kv_self.size;

-        data_ctx->write(&
-        data_ctx->write(&
+        data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
+        data_ctx->write(&kv_head,     sizeof(kv_head));
+        data_ctx->write(&kv_size,     sizeof(kv_size));

-        if (
+        if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);

             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};

-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
             kout3d->data = kout3d_data.data();

-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
             vout3d->data = vout3d_data.data();

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
@@ -7144,6 +7728,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             data_ctx->write(kout3d_data.data(), kout3d_data.size());
             data_ctx->write(vout3d_data.data(), vout3d_data.size());
         }
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            const auto & cell = kv_self.cells[i];
+
+            const llama_pos pos         = cell.pos;
+            const size_t    seq_id_size = cell.seq_id.size();
+
+            data_ctx->write(&pos,         sizeof(pos));
+            data_ctx->write(&seq_id_size, sizeof(seq_id_size));
+
+            for (auto seq_id : cell.seq_id) {
+                data_ctx->write(&seq_id, sizeof(seq_id));
+            }
+        }
     }
 }

@@ -7215,34 +7813,36 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         const int    n_embd  = hparams.n_embd_gqa();
         const int    n_ctx   = cparams.n_ctx;

-        size_t
-
+        size_t   kv_buf_size;
+        uint32_t kv_head;
+        uint32_t kv_size;

-        memcpy(&
-        memcpy(&
+        memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
+        memcpy(&kv_head,     inp, sizeof(kv_head));     inp += sizeof(kv_head);
+        memcpy(&kv_size,     inp, sizeof(kv_size));     inp += sizeof(kv_size);

-        if (
-            GGML_ASSERT(kv_self.buf.size ==
+        if (kv_buf_size) {
+            GGML_ASSERT(kv_self.buf.size == kv_buf_size);

             const size_t elt_size = ggml_element_size(kv_self.k);

             ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
             ggml_cgraph gf{};

-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd,
+            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
             kin3d->data = (void *) inp;
             inp += ggml_nbytes(kin3d);

-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type,
+            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
             vin3d->data = (void *) inp;
             inp += ggml_nbytes(vin3d);

             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd,
+                n_embd, kv_head, n_layer,
                 elt_size*n_embd, elt_size*n_embd*n_ctx, 0);

             ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-
+                kv_head, n_embd, n_layer,
                 elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);

             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
@@ -7252,8 +7852,27 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_free(cpy_ctx);
         }

-        ctx->kv_self.head =
+        ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
+
+        ctx->kv_self.cells.resize(kv_size);
+
+        for (uint32_t i = 0; i < kv_size; ++i) {
+            llama_pos pos;
+            size_t    seq_id_size;
+
+            memcpy(&pos,         inp, sizeof(pos));         inp += sizeof(pos);
+            memcpy(&seq_id_size, inp, sizeof(seq_id_size)); inp += sizeof(seq_id_size);
+
+            ctx->kv_self.cells[i].pos = pos;
+
+            llama_seq_id seq_id;
+
+            for (size_t j = 0; j < seq_id_size; ++j) {
+                memcpy(&seq_id, inp, sizeof(seq_id)); inp += sizeof(seq_id);
+                ctx->kv_self.cells[i].seq_id.insert(seq_id);
+            }
+        }
     }

     const size_t nread = inp - src;
@@ -7471,6 +8090,22 @@ llama_token llama_token_eos(const struct llama_context * ctx) {
 llama_token llama_token_nl(const struct llama_context * ctx) {
     return ctx->model.vocab.linefeed_id;
 }
+llama_token llama_token_prefix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_prefix_id;
+}
+
+llama_token llama_token_middle(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_middle_id;
+}
+
+llama_token llama_token_suffix(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_suffix_id;
+}
+
+llama_token llama_token_eot(const struct llama_context * ctx) {
+    return ctx->model.vocab.special_eot_id;
+}
+

 int llama_tokenize(
     const struct llama_model * model,
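These four accessors expose the fill-in-the-middle special tokens that the Refact vocabulary ships with (prefix 32007, suffix 32008, middle 32009, EOT 32010, per the `llama_vocab` defaults above). A hedged sketch of how an infill prompt might be assembled from them; the prefix/suffix/middle ordering shown here is the usual FIM convention for such models, not something this diff enforces, and `ctx` is assumed to be an already-created `llama_context`:

    #include <vector>
    #include "llama.h"

    // build the special-token skeleton of a fill-in-the-middle prompt:
    //   <fim_prefix> ...tokens before the hole... <fim_suffix> ...tokens after... <fim_middle>
    static std::vector<llama_token> fim_skeleton(const llama_context * ctx,
                                                 const std::vector<llama_token> & prefix_code,
                                                 const std::vector<llama_token> & suffix_code) {
        std::vector<llama_token> out;
        out.push_back(llama_token_prefix(ctx));
        out.insert(out.end(), prefix_code.begin(), prefix_code.end());
        out.push_back(llama_token_suffix(ctx));
        out.insert(out.end(), suffix_code.begin(), suffix_code.end());
        out.push_back(llama_token_middle(ctx));
        // generation then runs until llama_token_eot(ctx) is sampled
        return out;
    }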
@@ -7493,35 +8128,66 @@ int llama_tokenize(
     return res.size();
 }

+static std::string llama_decode_text(const std::string & text) {
+    std::string decoded_text;
+    auto unicode_sequences = codepoints_from_utf8(text);
+    for (auto& unicode_sequence : unicode_sequences) {
+        decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence));
+    }
+
+    return decoded_text;
+}
+
 // does not write null-terminator to buf
 int llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int length) {
     if (0 <= token && token < llama_n_vocab(model)) {
-
-
-        if (
+        switch (llama_vocab_get_type(model->vocab)) {
+        case LLAMA_VOCAB_TYPE_SPM: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
+                if (length < 3) {
+                    return -3;
+                }
+                memcpy(buf, "\xe2\x96\x85", 3);
+                return 3;
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else if (llama_is_byte_token(model->vocab, token)) {
+                if (length < 1) {
+                    return -1;
+                }
+                buf[0] = llama_token_to_byte(model->vocab, token);
+                return 1;
+            } else {
+                GGML_ASSERT(false);
             }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        } else if (llama_is_byte_token(model->vocab, token)) {
-            if (length < 1) {
-                return -1;
+            break;
+        }
+        case LLAMA_VOCAB_TYPE_BPE: {
+            if (llama_is_normal_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                result = llama_decode_text(result);
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
+            } else if (llama_is_control_token(model->vocab, token)) {
+                ;
+            } else {
+                GGML_ASSERT(false);
             }
-
-
+            break;
+        }
+        default:
+            GGML_ASSERT(false);
         }
     }
     return 0;
@@ -7548,14 +8214,14 @@ void llama_print_timings(struct llama_context * ctx) {
     const llama_timings timings = llama_get_timings(ctx);

     LLAMA_LOG_INFO("\n");
-    LLAMA_LOG_INFO("%s:        load time = %
-    LLAMA_LOG_INFO("%s:      sample time = %
+    LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, timings.t_load_ms);
+    LLAMA_LOG_INFO("%s:      sample time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
-    LLAMA_LOG_INFO("%s: prompt eval time = %
+    LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
-    LLAMA_LOG_INFO("%s:        eval time = %
+    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
 }

 void llama_reset_timings(struct llama_context * ctx) {
|