llama_cpp 0.5.1 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,4 @@
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
+#define LLAMA_API_INTERNAL
 #include "llama.h"

 #include "ggml.h"
@@ -113,7 +109,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }

-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,20 +156,24 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };

 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,
-    { LLM_ARCH_FALCON,
-    { LLM_ARCH_GPT2,
-    { LLM_ARCH_GPTJ,
-    { LLM_ARCH_GPTNEOX,
-    { LLM_ARCH_MPT,
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };

 enum llm_kv {
@@ -314,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -360,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -658,9 +692,7 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
-
             WIN32_MEMORY_RANGE_ENTRY range;
-
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -876,9 +908,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -888,24 +922,24 @@ enum e_model {

 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;

-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab
-    uint32_t n_ctx_train
-    uint32_t n_ctx
-    uint32_t n_embd
-    uint32_t n_head
-    uint32_t n_head_kv
-    uint32_t n_layer
-    uint32_t n_rot
-    uint32_t n_ff
-
-    float f_norm_eps
-    float f_norm_rms_eps
-
-    float rope_freq_base
-    float rope_freq_scale
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;

     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -947,13 +981,22 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;

+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;

     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
 };

 struct llama_kv_cache {
@@ -1027,10 +1070,11 @@ struct llama_model {

     std::string name = "n/a";

-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;

     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1231,6 +1275,7 @@ struct llama_model_loader {
     int n_created = 0;

     int64_t n_elements = 0;
+    size_t n_bytes = 0;

     bool use_mmap = false;

@@ -1263,6 +1308,7 @@ struct llama_model_loader {
         const char * name = gguf_get_tensor_name(ctx_gguf, i);
         struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
         n_elements += ggml_nelements(t);
+        n_bytes    += ggml_nbytes(t);
     }

     LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1541,7 +1587,7 @@ struct llama_model_loader {
 // load LLaMA models
 //

-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_ftype_name(enum llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1574,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1620,28 +1668,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }

     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-
-        rope_freq_scale = 1.0f/ropescale;
-    }
+        rope_freq_scale = 1.0f/ropescale;
     }

     // sanity check for n_rot (optional)
@@ -1685,6 +1722,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };

@@ -1838,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }

     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -1925,7 +1987,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;

     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
|

                 model.layers.resize(n_layer);

+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2073,6 +2200,85 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     };
@@ -2354,11 +2560,356 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");

-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+
+static struct ggml_cgraph * llm_build_baichaun(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v  = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v  = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");

-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");

@@ -2413,10 +2964,26 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");

+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");

             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
@@ -2851,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }

+static struct ggml_cgraph * llm_build_starcoder(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < N; ++i) {
+                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_token * tokens,
@@ -2866,10 +3662,18 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         default:
             GGML_ASSERT(false);
     };
@@ -2956,6 +3760,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }

+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];

@@ -2971,10 +3784,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3123,10 +3932,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs +=
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
@@ -3488,7 +4296,7 @@ struct llama_grammar_candidate {

 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4642,7 +5450,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //

-
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4677,7 +5494,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count

-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5506,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
|
|
4690
5506
|
qtype.to_float(inbuf, outbuf, nels);
|
4691
5507
|
}
|
4692
5508
|
};
|
4693
|
-
workers.
|
5509
|
+
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
4694
5510
|
in_buff_offs += thr_block_bytes;
|
4695
5511
|
out_buff_offs += thr_elems;
|
4696
5512
|
}
|
4697
|
-
for (auto &
|
4698
|
-
|
5513
|
+
for (auto & w : workers) { w.join(); }
|
5514
|
+
workers.clear();
|
5515
|
+
}
|
5516
|
+
|
5517
|
+
#ifdef GGML_USE_K_QUANTS
|
5518
|
+
static ggml_type get_k_quant_type(
|
5519
|
+
ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
|
5520
|
+
int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
|
5521
|
+
) {
|
5522
|
+
const std::string name = ggml_get_name(tensor);
|
5523
|
+
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
5524
|
+
const auto tn = LLM_TN(model.arch);
|
5525
|
+
|
5526
|
+
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
5527
|
+
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
5528
|
+
};
|
5529
|
+
|
5530
|
+
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
5531
|
+
int nx = tensor->ne[0];
|
5532
|
+
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
5533
|
+
new_type = GGML_TYPE_Q8_0;
|
5534
|
+
}
|
5535
|
+
else if (new_type != GGML_TYPE_Q8_0) {
|
5536
|
+
new_type = GGML_TYPE_Q6_K;
|
5537
|
+
}
|
5538
|
+
} else if (name.find("attn_v.weight") != std::string::npos) {
|
5539
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
5540
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
5541
|
+
new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
5542
|
+
}
|
5543
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
5544
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
5545
|
+
use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
|
5546
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
5547
|
+
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+                (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
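
For reference, the branches above lean on the same small layer-selection heuristic as before: the use_more_bits lambda that the next hunk removes from llama_model_quantize_internal (presumably now a helper next to get_k_quant_type). A standalone sketch of what it selects; the 32-layer count is only an illustration, not taken from this release:

    #include <cstdio>

    // Same predicate as the use_more_bits lambda shown (removed) in the next hunk:
    // the first eighth of the layers, the last eighth, and every third layer in between.
    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

    int main() {
        const int n_layers = 32; // illustrative layer count only
        for (int i = 0; i < n_layers; ++i) {
            if (use_more_bits(i, n_layers)) {
                // e.g. ffn_down in these layers is bumped to Q6_K under Q4_K_M / Q5_K_M
                std::printf("layer %2d gets extra bits\n", i);
            }
        }
        return 0;
    }
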
@@ -4782,18 +5707,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
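
The scratch buffers above move from std::vector<uint8_t> to std::vector<no_init<uint8_t>>. The wrapper itself is defined elsewhere in llama.cpp; the sketch below only illustrates the idea behind such a type, it is not the project's exact definition: a default constructor that does nothing means vector::resize() no longer zero-fills buffers that are about to be overwritten anyway.

    #include <vector>

    // Illustrative wrapper (an assumption about the shape of llama.cpp's no_init):
    // resizing a std::vector of this does not value-initialize each element.
    template <typename T>
    struct no_init {
        T value;
        no_init() { /* intentionally left uninitialized */ }
    };

    int main() {
        std::vector<no_init<unsigned char>> read_data;
        read_data.resize(64 * 1024 * 1024); // grows without a 64 MiB memset
        return 0;
    }
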
@@ -4815,7 +5736,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.resize(ggml_nbytes(tensor));
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
@@ -4840,101 +5763,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-
-
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                        use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                        (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
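
Both the removed inline logic and the new get_k_quant_type() finish with the same compatibility check: k-quants pack weights in super-blocks of QK_K values, so a tensor whose row length is not a multiple of QK_K cannot be k-quantized and falls back to F16 (output), Q4_0 (token embeddings) or an error. A worked instance of that arithmetic, with a hypothetical row length:

    #include <cstdio>

    int main() {
        const int QK_K = 256;  // default super-block size; the code above also handles a QK_K == 64 build
        const int nx   = 4000; // hypothetical tensor->ne[0], not taken from this diff
        if (nx % QK_K != 0) {
            // 4000 % 256 == 160, so this row length is incompatible with k-quants
            std::printf("cols %d not divisible by %d (remainder %d) -> fall back\n", nx, QK_K, nx % QK_K);
        }
        return 0;
    }
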
@@ -4949,23 +5780,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.resize(nelements * 4); // upper bound on size
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::vector<int64_t> hist_cur(1 << 4, 0);
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5808,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             size_t counter = 0;
             new_size = 0;
             auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                std::vector<int64_t> local_hist;
+                std::array<int64_t, 1 << 4> local_hist = {};
                 size_t local_size = 0;
                 while (true) {
                     std::unique_lock<std::mutex> lock(mutex);
                     size_t first = counter; counter += chunk_size;
                     if (first >= nelements) {
-                        if (!local_hist.empty()) {
+                        if (local_size > 0) {
                             for (int j=0; j<int(local_hist.size()); ++j) {
                                 hist_cur[j] += local_hist[j];
                             }
@@ -4992,22 +5824,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     }
                     lock.unlock();
                     size_t last = std::min(nelements, first + chunk_size);
-                    if (local_hist.empty()) {
-                        local_hist.resize(hist_cur.size(), 0);
-                    }
                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                 }
             };
-            if ((int) workers.size() < nthread_use - 1) {
-                workers.resize(nthread_use - 1);
-            }
             for (int it = 0; it < nthread_use - 1; ++it) {
-                workers[it] = std::thread(compute);
+                workers.emplace_back(compute);
             }
             compute();
-            for (int it = 0; it < nthread_use - 1; ++it) {
-                workers[it].join();
-            }
+            for (auto & w : workers) { w.join(); }
+            workers.clear();
         }
 
         LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
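
The hunk above also changes how the quantization workers are managed: threads are emplace_back'd into the reserved vector for each tensor, compute() runs once on the calling thread as well, and the helpers are then joined and the vector cleared for reuse. A minimal standalone sketch of that pattern (the job body is a stand-in, not the library's compute lambda):

    #include <functional>
    #include <thread>
    #include <vector>

    // Run `job` on nthread_use threads in total: nthread_use - 1 helpers plus the caller.
    static void run_parallel(int nthread_use, const std::function<void()> & job) {
        std::vector<std::thread> workers;
        workers.reserve(nthread_use - 1);
        for (int it = 0; it < nthread_use - 1; ++it) {
            workers.emplace_back(job);          // spawn the helpers
        }
        job();                                  // the calling thread takes a share of the work
        for (auto & w : workers) { w.join(); }  // wait for the helpers
        workers.clear();                        // ready to be refilled for the next tensor
    }

    int main() {
        run_parallel(4, [] { /* per-chunk quantization work would go here */ });
        return 0;
    }
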
@@ -5069,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 
 // TODO: after the GGUF PR, this likely won't work and needs to be updated
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
@@ -5353,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
-        /*.rope_freq_base =*/ 10000.0f,
-        /*.rope_freq_scale =*/ 1.0f,
+        /*.rope_freq_base =*/ 0.0f,
+        /*.rope_freq_scale =*/ 0.0f,
         /*.progress_callback =*/ nullptr,
        /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
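
The rope_freq_base / rope_freq_scale defaults change from fixed numbers to 0.0f. A hedged usage sketch: presumably a zero now means "take the RoPE parameters from the model file", so callers that depended on the old fixed defaults should set them explicitly.

    #include "llama.h"

    int main() {
        struct llama_context_params params = llama_context_default_params();
        // After this change params.rope_freq_base and params.rope_freq_scale start at 0.0f;
        // leaving them at zero defers to the model (an assumption about the new semantics).
        params.rope_freq_base  = 10000.0f; // pin the previous fixed behaviour explicitly if needed
        params.rope_freq_scale = 1.0f;
        (void) params;
        return 0;
    }
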
@@ -5616,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
+static struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     struct llama_model * model = llama_load_model_from_file(path_model, params);
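
llama_init_from_file() becomes static (file-local) here; as its body shows, it is a thin wrapper that loads the model and then creates a context. A hedged usage sketch of calling the two public entry points directly (the model path is illustrative):

    #include "llama.h"

    int main() {
        struct llama_context_params params = llama_context_default_params();
        struct llama_model * model = llama_load_model_from_file("model.gguf", params); // illustrative path
        if (model == NULL) {
            return 1;
        }
        struct llama_context * ctx = llama_new_context_with_model(model, params);
        if (ctx == NULL) {
            llama_free_model(model);
            return 1;
        }
        // ... use ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }
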
@@ -5635,15 +6462,19 @@ void llama_free(struct llama_context * ctx) {
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model.vocab.id_to_token.size();
+    return llama_model_n_vocab(&ctx->model);
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_ctx;
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }
 
 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_embd;
+    return llama_model_n_embd(&ctx->model);
 }
 
 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6489,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }
 
+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
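
The new *_n_ctx_train accessors expose the context length the model was trained with, alongside the runtime n_ctx. A hedged usage sketch of the obvious check this enables:

    #include <cstdio>

    #include "llama.h"

    // Warn when the configured context window exceeds what the model was trained for.
    static void warn_if_ctx_exceeds_training(const struct llama_context * ctx) {
        const int n_ctx       = llama_n_ctx(ctx);
        const int n_ctx_train = llama_n_ctx_train(ctx); // added in this release
        if (n_ctx > n_ctx_train) {
            std::fprintf(stderr, "warning: n_ctx (%d) exceeds the training context (%d)\n",
                         n_ctx, n_ctx_train);
        }
    }
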
@@ -5813,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6197,22 +7032,24 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
+                           int text_len,
                  llama_token * tokens,
                           int n_max_tokens,
                          bool add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
     const struct llama_model * model,
                   const char * text,
+                           int text_len,
                  llama_token * tokens,
                           int n_max_tokens,
                          bool add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
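
llama_tokenize() and llama_tokenize_with_model() now take the text length explicitly, and (with the error log commented out) a too-small buffer is reported quietly as the negative of the required token count. A hedged usage sketch of the resulting two-pass pattern:

    #include <string>
    #include <vector>

    #include "llama.h"

    static std::vector<llama_token> tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(8); // deliberately small first guess
        int n = llama_tokenize(ctx, text.data(), (int) text.size(), tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {
            tokens.resize(-n); // the negative return value is the required size
            n = llama_tokenize(ctx, text.data(), (int) text.size(), tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n > 0 ? n : 0);
        return tokens;
    }
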
@@ -6351,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
 