llama_cpp 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -3
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +32 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -11
- data/ext/llama_cpp/src/ggml-cuda.cu +1108 -699
- data/ext/llama_cpp/src/ggml-metal.m +93 -24
- data/ext/llama_cpp/src/ggml-metal.metal +407 -174
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +75 -43
- data/ext/llama_cpp/src/ggml.h +42 -32
- data/ext/llama_cpp/src/k_quants.c +4 -1
- data/ext/llama_cpp/src/llama.cpp +1040 -201
- data/ext/llama_cpp/src/llama.h +13 -7
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,8 +1,4 @@
-
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
+#define LLAMA_API_INTERNAL
 #include "llama.h"
 
 #include "ggml.h"
@@ -113,7 +109,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,20 +156,24 @@ static std::string format(const char * fmt, ...) {
 enum llm_arch {
     LLM_ARCH_LLAMA,
     LLM_ARCH_FALCON,
+    LLM_ARCH_BAICHUAN,
     LLM_ARCH_GPT2,
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,
-    { LLM_ARCH_FALCON,
-    { LLM_ARCH_GPT2,
-    { LLM_ARCH_GPTJ,
-    { LLM_ARCH_GPTNEOX,
-    { LLM_ARCH_MPT,
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };
 
 enum llm_kv {
@@ -314,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_BAICHUAN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_FALCON,
         {
@@ -360,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_POS_EMBD, "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -658,9 +692,7 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
-
             WIN32_MEMORY_RANGE_ENTRY range;
-
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -876,9 +908,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -888,24 +922,24 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab
-    uint32_t n_ctx_train
-    uint32_t n_ctx
-    uint32_t n_embd
-    uint32_t n_head
-    uint32_t n_head_kv
-    uint32_t n_layer
-    uint32_t n_rot
-    uint32_t n_ff
-
-    float f_norm_eps
-    float f_norm_rms_eps
-
-    float rope_freq_base
-    float rope_freq_scale
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -947,13 +981,22 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;
 
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
 
     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
 };
 
 struct llama_kv_cache {
@@ -1027,10 +1070,11 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1231,6 +1275,7 @@ struct llama_model_loader {
     int n_created = 0;
 
     int64_t n_elements = 0;
+    size_t n_bytes = 0;
 
     bool use_mmap = false;
 
@@ -1263,6 +1308,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes += ggml_nbytes(t);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1541,7 +1587,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_ftype_name(enum llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1574,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B: return "1B";
         case MODEL_3B: return "3B";
         case MODEL_7B: return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1620,28 +1668,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
@@ -1685,6 +1722,26 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1838,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
 
     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
@@ -1925,7 +1987,6 @@ static void llm_load_tensors(
     const int64_t n_vocab = hparams.n_vocab;
 
     const auto tn = LLM_TN(model.arch);
-
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
@@ -1968,6 +2029,72 @@ static void llm_load_tensors(
 
                 model.layers.resize(n_layer);
 
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
+                            ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                    }
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
                 for (uint32_t i = 0; i < n_layer; ++i) {
                     const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
                     const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2073,6 +2200,85 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     };
@@ -2354,11 +2560,356 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(tmpq);
             ggml_set_name(tmpq, "tmpq");
 
-            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Kcur);
+            ggml_set_name(Kcur, "Kcur");
+
+            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+            offload_func_kq(Qcur);
+            ggml_set_name(Qcur, "Qcur");
+
+            // store key and value to memory
+            {
+                // compute the transposed [N, n_embd] V matrix
+
+                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                offload_func_v(tmpv);
+                ggml_set_name(tmpv, "tmpv");
+
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
+                offload_func_v(Vcur);
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                offload_func_kq(k);
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+                offload_func_v(v);
+                ggml_set_name(v, "v");
+
+                // important: storing RoPE-ed version of K in the KV cache!
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+            offload_func_kq(Q);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            offload_func_kq(K);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            offload_func_kq(KQ);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            offload_func_kq(KQ_scaled);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            offload_func_kq(KQ_masked);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            offload_func_v(KQ_soft_max);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            offload_func_v(V);
+            ggml_set_name(V, "V");
+
+#if 1
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            offload_func_v(KQV);
+            ggml_set_name(KQV, "KQV");
+#else
+            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
+            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
+            // is there a better way?
+            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
+#endif
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            offload_func_v(KQV_merged);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            offload_func_v(cur);
+            ggml_set_name(cur, "KQV_merged_contiguous");
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].wo,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_wo");
+        }
+
+        struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
+        offload_func(inpFF);
+        ggml_set_name(inpFF, "inpFF");
+
+        // feed-forward network
+        {
+            // norm
+            {
+                cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
+                offload_func(cur);
+                ggml_set_name(cur, "rms_norm_1");
+
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
+                offload_func(cur);
+                ggml_set_name(cur, "ffn_norm");
+            }
+
+            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
+                    model.layers[il].w3,
+                    cur);
+            offload_func(tmp);
+            ggml_set_name(tmp, "result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w1,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w1");
+
+            // SILU activation
+            cur = ggml_silu(ctx0, cur);
+            offload_func(cur);
+            ggml_set_name(cur, "silu");
+
+            cur = ggml_mul(ctx0, cur, tmp);
+            offload_func(cur);
+            ggml_set_name(cur, "silu_x_result_w3");
+
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].w2,
+                    cur);
+            offload_func(cur);
+            ggml_set_name(cur, "result_w2");
+        }
+
+        cur = ggml_add(ctx0, cur, inpFF);
+        offload_func(cur);
+        ggml_set_name(cur, "inpFF_+_result_w2");
+
+        // input for next layer
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // norm
+    {
+        cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
+        offload_func_nr(cur);
+        ggml_set_name(cur, "rms_norm_2");
+
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.output_norm);
+        // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
+        ggml_set_name(cur, "result_norm");
+    }
+
+    // lm_head
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+
+    ggml_free(ctx0);
+
+    return gf;
+}
+
+
+static struct ggml_cgraph * llm_build_baichaun(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float freq_base = hparams.rope_freq_base;
+    const float freq_scale = hparams.rope_freq_scale;
+    const float norm_rms_eps = hparams.f_norm_rms_eps;
+
+    const int n_gpu_layers = model.n_gpu_layers;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, inpL);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
+        }
+    }
+
+    const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
+
+    // offload functions set the tensor output backend to GPU
+    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
+    // in that case ggml_cuda_assign_buffers has no effect
+    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
+    offload_func_t offload_func_kq = llama_nop;
+    offload_func_t offload_func_v = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+    if (n_gpu_layers > n_layer) {
+        offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 1) {
+        offload_func_v = ggml_cuda_assign_buffers_no_alloc;
+    }
+    if (n_gpu_layers > n_layer + 2) {
+        offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
+    }
+#endif // GGML_USE_CUBLAS
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    for (int il = 0; il < n_layer; ++il) {
+        ggml_format_name(inpL, "layer_inp_%d", il);
+
+        offload_func_t offload_func = llama_nop;
+
+#ifdef GGML_USE_CUBLAS
+        if (il >= i_gpu_start) {
+            offload_func = ggml_cuda_assign_buffers_no_alloc;
+        }
+#endif // GGML_USE_CUBLAS
+
+        struct ggml_tensor * inpSA = inpL;
+
+        // norm
+        {
+            cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
+            offload_func(cur);
+            ggml_set_name(cur, "rms_norm_0");
+
+            // cur = cur*attn_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
+            offload_func(cur);
+            ggml_set_name(cur, "attention_norm_0");
+        }
+
+        // self-attention
+        {
+            // compute Q and K and RoPE them
+            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+            offload_func_kq(tmpk);
+            ggml_set_name(tmpk, "tmpk");
+
+            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+            offload_func_kq(tmpq);
+            ggml_set_name(tmpq, "tmpq");
+
+            struct ggml_tensor * Kcur;
+            struct ggml_tensor * Qcur;
+            switch (model.type) {
+                case MODEL_7B:
+                    Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+                    break;
+                case MODEL_13B:
+                    Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+                    Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
+
             offload_func_kq(Kcur);
             ggml_set_name(Kcur, "Kcur");
 
-            struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
             offload_func_kq(Qcur);
             ggml_set_name(Qcur, "Qcur");
 
@@ -2413,10 +2964,26 @@ static struct ggml_cgraph * llm_build_llama(
             offload_func_kq(KQ_scaled);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
+            struct ggml_tensor * KQ_masked;
+            struct ggml_tensor * KQ_scaled_alibi;
+
+            switch (model.type) {
+                case MODEL_7B:
+                    KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+                    break;
+                case MODEL_13B:
+                    KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+                    ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+                    KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+                    break;
+                default:
+                    GGML_ASSERT(false);
+            }
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
-
-
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+            // offload_func_kq(KQ_masked);
+            // ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
@@ -2851,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_starcoder(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd = hparams.n_embd;
+    const int64_t n_layer = hparams.n_layer;
+    const int64_t n_ctx = hparams.n_ctx;
+    const int64_t n_head = hparams.n_head;
+    const int64_t n_head_kv = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < N; ++i) {
+                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        ( n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                    ggml_cpy(ctx0,
+                        Qcur,
+                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
         const llama_token * tokens,
@@ -2866,10 +3662,18 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         case LLM_ARCH_FALCON:
            {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         default:
             GGML_ASSERT(false);
     };
@@ -2956,6 +3760,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }
 
+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 
@@ -2971,10 +3784,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3123,10 +3932,9 @@ struct llm_tokenizer_spm {
         while (offs < text.size()) {
             llm_symbol sym;
             size_t len = utf8_len(text[offs]);
-            GGML_ASSERT(offs + len <= text.size());
             sym.text = text.c_str() + offs;
-            sym.n = len;
-            offs +=
+            sym.n = std::min(len, text.size() - offs);
+            offs += sym.n;
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
@@ -3488,7 +4296,7 @@ struct llama_grammar_candidate {
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -4642,7 +5450,16 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
-
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
+static void llama_convert_tensor_internal(
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    const size_t nelements, const int nthread
+) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
@@ -4677,7 +5494,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
     auto blocks_per_thread = nblocks / nthread;
     auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    std::vector<std::thread> workers;
     for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
         auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
         auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4690,14 +5506,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
|
|
4690
5506
|
qtype.to_float(inbuf, outbuf, nels);
|
4691
5507
|
}
|
4692
5508
|
};
|
4693
|
-
workers.
|
5509
|
+
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
|
4694
5510
|
in_buff_offs += thr_block_bytes;
|
4695
5511
|
out_buff_offs += thr_elems;
|
4696
5512
|
}
|
4697
|
-
for (auto &
|
4698
|
-
|
5513
|
+
for (auto & w : workers) { w.join(); }
|
5514
|
+
workers.clear();
|
5515
|
+
}
|
5516
|
+
|
5517
|
+
#ifdef GGML_USE_K_QUANTS
|
5518
|
+
static ggml_type get_k_quant_type(
|
5519
|
+
ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
|
5520
|
+
int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
|
5521
|
+
) {
|
5522
|
+
const std::string name = ggml_get_name(tensor);
|
5523
|
+
// TODO: avoid hardcoded tensor names - use the TN_* constants
|
5524
|
+
const auto tn = LLM_TN(model.arch);
|
5525
|
+
|
5526
|
+
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
|
5527
|
+
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
|
5528
|
+
};
|
5529
|
+
|
5530
|
+
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
5531
|
+
int nx = tensor->ne[0];
|
5532
|
+
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
5533
|
+
new_type = GGML_TYPE_Q8_0;
|
5534
|
+
}
|
5535
|
+
else if (new_type != GGML_TYPE_Q8_0) {
|
5536
|
+
new_type = GGML_TYPE_Q6_K;
|
5537
|
+
}
|
5538
|
+
} else if (name.find("attn_v.weight") != std::string::npos) {
|
5539
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
5540
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
5541
|
+
new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
5542
|
+
}
|
5543
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
5544
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
5545
|
+
+            use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
+        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
+            (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
+        if (model.type == MODEL_70B) {
+            // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
+            // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
+            // nearly negligible increase in model size by quantizing this tensor with more bits:
+            if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_attention_wv;
+    } else if (name.find("ffn_down.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
+            new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+                     : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
+                     : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
+            new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+            if (model.arch == LLM_ARCH_FALCON) {
+                new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                           use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
+            } else {
+                if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+            }
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
+            new_type = GGML_TYPE_Q5_K;
+        }
+        ++*i_feed_forward_w2;
+    } else if (name.find("attn_output.weight") != std::string::npos) {
+        if (model.arch != LLM_ARCH_FALCON) {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+        } else {
+            if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        }
+    }
+    else if (name.find("attn_qkv.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
+    }
+    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    }
+    // This can be used to reduce the size of the Q5_K_S model.
+    // The associated PPL increase is fully in line with the size reduction
+    //else {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
+    //}
+    bool convert_incompatible_tensor = false;
+    if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
+        int nx = tensor->ne[0];
+        int ny = tensor->ne[1];
+        if (nx % QK_K != 0) {
+            LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
+            convert_incompatible_tensor = true;
+        }
+    }
+    if (convert_incompatible_tensor) {
+        if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+            LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
+        } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+            new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+            LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
+        } else {
+            throw std::runtime_error("Unsupported tensor size encountered\n");
+        }
+    }
     }
+
+    return new_type;
 }
+#endif
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type quantized_type;
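The per-layer `use_more_bits` test that appears throughout the block above decides which layers are worth a higher-precision quant based on their position in the network. A minimal standalone sketch of that rule, copied from the lambda that is removed further down in this diff (only the wrapping into a named function is mine):

    // Layer-selection rule behind use_more_bits (mirrors the removed lambda):
    // bump precision for the first and last eighth of the layers, plus every
    // third layer in between.
    static bool use_more_bits(int i_layer, int num_layers) {
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
    }

For a 32-layer model this selects layers 0 through 3 and 28 through 31, plus every third layer starting at layer 6.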
@@ -4782,18 +5707,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<int64_t> hist_all(1 << 4, 0);
 
     std::vector<std::thread> workers;
+    workers.reserve(nthread);
     std::mutex mutex;
 
-#ifdef GGML_USE_K_QUANTS
-    auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
-        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
-    };
-#endif
-
     int idx = 0;
 
-    std::vector<uint8_t
-    std::vector<uint8_t
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
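The scratch buffers switch from plain `std::vector<uint8_t>` to `std::vector<no_init<uint8_t>>`. The point of such a wrapper is to make `resize()` skip value-initialization, so large scratch space is not zero-filled right before being overwritten. A sketch of how a wrapper like this is commonly written (the exact definition in llama.cpp may differ):

    // Trivial default constructor: vector::resize() default-constructs the
    // elements, and this constructor deliberately leaves them uninitialized.
    template <typename T>
    struct no_init {
        T value;
        no_init() { /* intentionally no initialization */ }
    };

Because the wrapper holds a single `T`, the buffer can still be handed onward through `.data()` much as the plain byte vector was.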
@@ -4815,7 +5736,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        read_data.
+        if (read_data.size() < ggml_nbytes(tensor)) {
+            read_data.resize(ggml_nbytes(tensor));
+        }
         tensor->data = read_data.data();
         ml->load_data_for(tensor);
 
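The read buffer is now grown only when the current tensor is larger than anything seen so far, and is reused across loop iterations rather than resized for every tensor. The pattern in isolation, with placeholder names, would look roughly like this:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Grow-only scratch buffer (sketch): resize only when a new maximum size
    // is encountered, so smaller items reuse the capacity already allocated.
    static void process_all(const std::vector<size_t> & sizes /* per-item byte counts, placeholder */) {
        std::vector<uint8_t> buf;
        for (size_t need : sizes) {
            if (buf.size() < need) {
                buf.resize(need);
            }
            // ... fill and consume buf.data() for this item ...
        }
    }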
@@ -4840,101 +5763,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-
-
-
-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                int nx = tensor->ne[0];
-                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
-                    new_type = GGML_TYPE_Q8_0;
-                }
-                else if (new_type != GGML_TYPE_Q8_0) {
-                    new_type = GGML_TYPE_Q6_K;
-                }
-            } else if (name.find("attn_v.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
-                    use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-                else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                    (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
-                if (model.type == MODEL_70B) {
-                    // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
-                    // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
-                    // nearly negligible increase in model size by quantizing this tensor with more bits:
-                    if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_attention_wv;
-            } else if (name.find("ffn_down.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-                    new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
-                             : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
-                             : GGML_TYPE_Q3_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
-                    new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
-                    if (model.arch == LLM_ARCH_FALCON) {
-                        new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
-                                   use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
-                    } else {
-                        if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                    }
-                }
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
-                    new_type = GGML_TYPE_Q5_K;
-                }
-                ++i_feed_forward_w2;
-            } else if (name.find("attn_output.weight") != std::string::npos) {
-                if (model.arch != LLM_ARCH_FALCON) {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-                    else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-                } else {
-                    if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                }
-            }
-            else if (name.find("attn_qkv.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
-            }
-            else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-            }
-            // This can be used to reduce the size of the Q5_K_S model.
-            // The associated PPL increase is fully in line with the size reduction
-            //else {
-            //    if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
-            //}
-            bool convert_incompatible_tensor = false;
-            if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-                new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor->ne[0];
-                int ny = tensor->ne[1];
-                if (nx % QK_K != 0) {
-                    LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
-                    convert_incompatible_tensor = true;
-                }
-            }
-            if (convert_incompatible_tensor) {
-                if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
-                    new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
-                    LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
-                    new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
-                    LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
-                } else {
-                    throw std::runtime_error("Unsupported tensor size encountered\n");
-                }
-            }
+            new_type = get_k_quant_type(
+                new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
+            );
 #endif
             // If we've decided to quantize to the same type the tensor is already
             // in then there's nothing to do.
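The large per-tensor override block is gone from the loop body and now lives in the `get_k_quant_type` helper added earlier in this diff; the two layer counters are passed by pointer so the helper can advance them. The declaration implied by the call site would look roughly like the sketch below (a reconstruction from the call, with parameter names taken from it; the exact upstream prototype may differ):

    // Rough shape of the helper implied by the call above (sketch only).
    static ggml_type get_k_quant_type(
        ggml_type           new_type,
        const ggml_tensor * tensor,
        const llama_model & model,
        llama_ftype         ftype,
        int * i_attention_wv,    int n_attention_wv,
        int * i_feed_forward_w2, int n_feed_forward_w2);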
@@ -4949,23 +5780,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
-            std::vector<float> f32_conv_buf;
 
             if (tensor->type == GGML_TYPE_F32) {
                 f32_data = (float *) tensor->data;
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
+                llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
             LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
             fflush(stdout);
 
-            work.
+            if (work.size() < nelements * 4) {
+                work.resize(nelements * 4); // upper bound on size
+            }
             new_data = work.data();
-            std::
+            std::array<int64_t, 1 << 4> hist_cur = {};
 
             static const int chunk_size = 32 * 512;
             const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4976,13 +5808,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             size_t counter = 0;
             new_size = 0;
             auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
-                std::
+                std::array<int64_t, 1 << 4> local_hist = {};
                 size_t local_size = 0;
                 while (true) {
                     std::unique_lock<std::mutex> lock(mutex);
                     size_t first = counter; counter += chunk_size;
                     if (first >= nelements) {
-                        if (
+                        if (local_size > 0) {
                             for (int j=0; j<int(local_hist.size()); ++j) {
                                 hist_cur[j] += local_hist[j];
                             }
@@ -4992,22 +5824,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     }
                     lock.unlock();
                     size_t last = std::min(nelements, first + chunk_size);
-                    if (local_hist.empty()) {
-                        local_hist.resize(hist_cur.size(), 0);
-                    }
                     local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
                 }
             };
-            if ((int) workers.size() < nthread_use - 1) {
-                workers.resize(nthread_use - 1);
-            }
             for (int it = 0; it < nthread_use - 1; ++it) {
-                workers
+                workers.emplace_back(compute);
             }
             compute();
-            for (
-
-            }
+            for (auto & w : workers) { w.join(); }
+            workers.clear();
         }
 
         LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
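Worker handling changes from a pre-sized `std::vector<std::thread>` to a reserve-once, `emplace_back`/`join`/`clear` cycle per tensor. The shape of the new pattern, reduced to its essentials (a sketch with a placeholder work callable, not the upstream code):

    #include <functional>
    #include <thread>
    #include <vector>

    // Per-item threading pattern from the hunk above: spawn nthread_use - 1
    // helpers, run one share of the work on the calling thread, then join and
    // clear so the same vector can be reused for the next item.
    static void run_chunked(std::vector<std::thread> & workers, int nthread_use,
                            const std::function<void()> & compute) {
        for (int it = 0; it < nthread_use - 1; ++it) {
            workers.emplace_back(compute);
        }
        compute();
        for (auto & w : workers) { w.join(); }
        workers.clear();
    }

Since `clear()` keeps the capacity reserved once with `workers.reserve(nthread)`, the per-tensor spawn/join cycle no longer reallocates the vector of thread handles.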
@@ -5069,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 // TODO: after the GGUF PR, this likely won't work and needs to be updated
-int llama_apply_lora_from_file_internal(
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
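Several file-local helpers in this diff (`llama_apply_lora_from_file_internal` here, and `llama_init_from_file` and `llama_copy_state_data_internal` further down) gain the `static` keyword, giving them internal linkage so they are no longer exported from the translation unit. In miniature:

    // `static` at namespace scope keeps the symbol private to this .cpp file.
    // (Illustrative helper, not taken from the diff.)
    static int file_local_helper(int x) {
        return 2 * x;
    }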
@@ -5353,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
-        /*.rope_freq_base =*/
-        /*.rope_freq_scale =*/
+        /*.rope_freq_base =*/ 0.0f,
+        /*.rope_freq_scale =*/ 0.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
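The default RoPE parameters in `llama_context_default_params()` change to `0.0f`, presumably so that an unset value falls back to whatever the loaded model specifies rather than a hard-coded constant. Callers that want a specific value still set the fields explicitly, along these lines (a usage sketch; the exact meaning of 0.0f is an assumption here):

    #include "llama.h"

    // Sketch: leave the fields at 0.0f to take the model/default values, or
    // pass explicit values to override them.
    static struct llama_context_params make_params(float freq_base, float freq_scale) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.rope_freq_base  = freq_base;   // 0.0f = keep the default behaviour (assumed)
        cparams.rope_freq_scale = freq_scale;  // 0.0f = keep the default behaviour (assumed)
        return cparams;
    }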
@@ -5616,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
+static struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     struct llama_model * model = llama_load_model_from_file(path_model, params);
@@ -5635,15 +6462,19 @@ void llama_free(struct llama_context * ctx) {
 }
 
 int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_vocab(&ctx->model);
 }
 
 int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_ctx(&ctx->model);
+}
+
+int llama_n_ctx_train(const struct llama_context * ctx) {
+    return llama_model_n_ctx_train(&ctx->model);
 }
 
 int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model
+    return llama_model_n_embd(&ctx->model);
 }
 
 enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5658,6 +6489,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }
 
+int llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
 int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
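The new `llama_n_ctx_train` / `llama_model_n_ctx_train` accessors expose the context length the model was trained with, alongside the runtime `n_ctx` of the current context. A typical check a caller might do (a sketch assuming an already-created `llama_context`):

    #include <cstdio>
    #include "llama.h"

    // Warn when the requested context window exceeds the training context.
    static void check_ctx_size(const struct llama_context * ctx) {
        const int n_ctx       = llama_n_ctx(ctx);
        const int n_ctx_train = llama_n_ctx_train(ctx);
        if (n_ctx > n_ctx_train) {
            fprintf(stderr, "warning: n_ctx (%d) is larger than the model's training context (%d)\n",
                    n_ctx, n_ctx_train);
        }
    }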
@@ -5813,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
 * llama_copy_state_data(ctx, &data_ctx);
 *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6197,22 +7032,24 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
+        int text_len,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
         const struct llama_model * model,
         const char * text,
+        int text_len,
         llama_token * tokens,
        int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
         return -((int) res.size());
     }
 
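`llama_tokenize` and `llama_tokenize_with_model` now take an explicit `text_len` instead of relying on a NUL terminator, and the buffer-too-small path still returns the negative of the required token count (now without logging an error). The usual two-pass calling pattern against the new signature would look roughly like this (a sketch; the positive-return convention for success is assumed from the existing API):

    #include <algorithm>
    #include <string>
    #include <vector>
    #include "llama.h"

    // Probe for the required size (negative return), then tokenize for real.
    static std::vector<llama_token> tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
        std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0) + 1); // rough upper bound
        int n = llama_tokenize(ctx, text.data(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {
            tokens.resize((size_t) -n);
            n = llama_tokenize(ctx, text.data(), (int) text.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize((size_t) std::max(n, 0));
        return tokens;
    }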
@@ -6351,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
 