llama_cpp 0.5.2 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"

 #include "ggml.h"
@@ -108,7 +109,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }

-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,17 +161,19 @@ enum llm_arch {
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };

 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,    "llama"    },
-    { LLM_ARCH_FALCON,   "falcon"   },
-    { LLM_ARCH_GPT2,     "gpt2"     },
-    { LLM_ARCH_GPTJ,     "gptj"     },
-    { LLM_ARCH_GPTNEOX,  "gptneox"  },
-    { LLM_ARCH_MPT,      "mpt"      },
-    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_LLAMA,     "llama"     },
+    { LLM_ARCH_FALCON,    "falcon"    },
+    { LLM_ARCH_GPT2,      "gpt2"      },
+    { LLM_ARCH_GPTJ,      "gptj"      },
+    { LLM_ARCH_GPTNEOX,   "gptneox"   },
+    { LLM_ARCH_MPT,       "mpt"       },
+    { LLM_ARCH_BAICHUAN,  "baichuan"  },
+    { LLM_ARCH_STARCODER, "starcoder" },
 };

 enum llm_kv {
@@ -376,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_POS_EMBD,    "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -680,6 +698,7 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -889,9 +908,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -901,24 +922,24 @@ enum e_model {

 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;

-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048;
-    uint32_t n_ctx       = 512;
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
-
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
-
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;

     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -960,13 +981,22 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;

+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;

     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
 };

 struct llama_kv_cache {
@@ -1040,10 +1070,11 @@ struct llama_model {

     std::string name = "n/a";

-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;

     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;

     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1244,6 +1275,7 @@ struct llama_model_loader {
     int n_created = 0;

     int64_t n_elements = 0;
+    size_t  n_bytes    = 0;

     bool use_mmap = false;

@@ -1276,6 +1308,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes    += ggml_nbytes(t);
         }

         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1554,7 +1587,7 @@ struct llama_model_loader {
 // load LLaMA models
 //

-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_ftype_name(enum llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1587,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1633,28 +1668,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));

-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }

     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-
-        rope_freq_scale = 1.0f/ropescale;
-    }
+        rope_freq_scale = 1.0f/ropescale;
     }

     // sanity check for n_rot (optional)
@@ -1707,6 +1731,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };

@@ -1860,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }

     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
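The new logging derives a bits-per-weight (BPW) figure from the freshly tracked n_bytes total: BPW = total tensor bytes × 8 / total element count, and the size line switches from MiB to GiB at the 1 GiB mark. A minimal standalone sketch of the same arithmetic, using hypothetical totals rather than numbers from a real model, looks like this:

    // Illustrative only: reproduces the formula behind the new "model size" log line.
    // The element and byte counts below are hypothetical.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_elements = 7000000000ULL;   // ~7 B parameters (hypothetical)
        const uint64_t n_bytes    = 3825205248ULL;   // ~3.56 GiB of tensor data (hypothetical)
        const double   GB         = 1024.0*1024.0*1024.0;

        std::printf("model params = %.2f B\n", n_elements*1e-9);
        std::printf("model size   = %.2f GiB (%.2f BPW)\n", n_bytes/GB, n_bytes*8.0/n_elements);
        return 0;
    }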
@@ -2160,6 +2200,85 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, backend_norm);
+                    model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa},         backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd},         backend_split);
+
+                    layer.ffn_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd},       backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff},         backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv)        +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.bo)          +
+                            ggml_nbytes(layer.ffn_norm)  + ggml_nbytes(layer.ffn_norm_b)  +
+                            ggml_nbytes(layer.w2)        + ggml_nbytes(layer.b2)          +
+                            ggml_nbytes(layer.w3)        + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     };
@@ -3299,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }

+static struct ggml_cgraph * llm_build_starcoder(
+         llama_context & lctx,
+     const llama_token * tokens,
+           const float * embd,
+                   int   n_tokens,
+                   int   n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model   = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < N; ++i) {
+                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd,     N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_token * tokens,
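In the graph above, StarCoder's attention uses a single fused wqkv projection: each token's output row holds Q (n_embd values) followed by K and V (n_embd_gqa values each), and the three ggml_view_2d calls slice that row at offsets 0, n_embd and n_embd + n_embd_gqa. A small sketch with hypothetical StarCoder-like dimensions (not taken from this diff) shows where the slices land:

    // Hypothetical dimensions, for illustration of the fused-QKV row layout only.
    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_embd     = 6144;   // model width (hypothetical)
        const size_t n_head     = 48;     // query heads (hypothetical)
        const size_t n_head_kv  = 1;      // multi-query attention: a single KV head (hypothetical)
        const size_t n_embd_gqa = n_embd / n_head * n_head_kv;   // = 128 here

        // Per-token row produced by the wqkv matmul, in floats:
        const size_t row = n_embd + 2*n_embd_gqa;
        std::printf("row = %zu floats: Q at [0, %zu), K at [%zu, %zu), V at [%zu, %zu)\n",
                    row, n_embd, n_embd, n_embd + n_embd_gqa, n_embd + n_embd_gqa, row);
        return 0;
    }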
@@ -3322,6 +3670,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         default:
             GGML_ASSERT(false);
     };
@@ -3408,6 +3760,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }

+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+                                        model.arch == LLM_ARCH_BAICHUAN ||
+                                        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];

@@ -3423,10 +3784,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb     (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3939,7 +4296,7 @@ struct llama_grammar_candidate {

 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -5537,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }

 // TODO: after the GGUF PR, this likely won't work and needs to be updated
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

     const int64_t t_start_lora_us = ggml_time_us();
@@ -5821,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
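With the defaults above now set to 0.0f, a zero value means "take the rope parameters from the model file" (see the llm_load_hparams change earlier, which falls back to 10000.0f / 1.0f when the GGUF metadata carries no value), while a non-zero value set by the caller still takes precedence. A minimal usage sketch against the llama.h API of this version, with a hypothetical model path:

    #include "llama.h"

    int main() {
        llama_backend_init(false);

        struct llama_context_params params = llama_context_default_params();
        // params.rope_freq_base  == 0.0f -> use the value stored in the GGUF file
        // params.rope_freq_scale == 0.0f -> use the value stored in the GGUF file
        // params.rope_freq_base = 20000.0f;   // uncomment to force a custom base instead

        struct llama_model   * model = llama_load_model_from_file("model.gguf", params);  // hypothetical path
        struct llama_context * ctx   = llama_new_context_with_model(model, params);

        // ... run inference ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }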
@@ -6084,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }

-struct llama_context * llama_init_from_file(
+static struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     struct llama_model * model = llama_load_model_from_file(path_model, params);
@@ -6289,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
 * llama_copy_state_data(ctx, &data_ctx);
 *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6673,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
+                         int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }

 int llama_tokenize_with_model(
     const struct llama_model * model,
                   const char * text,
+                         int   text_len,
                  llama_token * tokens,
                          int   n_max_tokens,
                         bool   add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);

     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
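llama_tokenize and llama_tokenize_with_model now take an explicit byte length instead of assuming a NUL-terminated string, so callers can tokenize substrings (or text containing embedded NUL bytes) without copying. A sketch of the updated call pattern; the helper below is illustrative, not part of the library, and it relies on the existing convention that a negative return value is the negated required token count:

    #include <string>
    #include <vector>
    #include "llama.h"

    // Illustrative helper: tokenize a prompt with the new length-aware API.
    static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt, bool add_bos) {
        // Roughly one token per byte plus BOS is usually enough; the retry below covers the rest.
        std::vector<llama_token> tokens(prompt.size() + (add_bos ? 1 : 0) + 1);
        int n = llama_tokenize(ctx, prompt.c_str(), (int) prompt.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        if (n < 0) {
            tokens.resize(-n);
            n = llama_tokenize(ctx, prompt.c_str(), (int) prompt.size(),
                               tokens.data(), (int) tokens.size(), add_bos);
        }
        tokens.resize(n);
        return tokens;
    }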
@@ -6827,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }

 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
