llama_cpp 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/llama_cpp/llama_cpp.cpp +2 -2
- data/ext/llama_cpp/src/ggml-alloc.c +6 -6
- data/ext/llama_cpp/src/ggml-cuda.cu +99 -46
- data/ext/llama_cpp/src/ggml-metal.m +37 -10
- data/ext/llama_cpp/src/ggml-metal.metal +144 -45
- data/ext/llama_cpp/src/ggml-opencl.cpp +3 -3
- data/ext/llama_cpp/src/ggml.c +68 -40
- data/ext/llama_cpp/src/ggml.h +43 -33
- data/ext/llama_cpp/src/llama.cpp +420 -57
- data/ext/llama_cpp/src/llama.h +5 -1
- data/lib/llama_cpp/version.rb +2 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -1,3 +1,4 @@
+#define LLAMA_API_INTERNAL
 #include "llama.h"
 
 #include "ggml.h"
@@ -108,7 +109,7 @@ static size_t utf8_len(char src) {
     return lookup[highbits];
 }
 
-void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     std::string result;
     for (size_t pos = 0; ; pos += search.length()) {
         auto new_pos = s.find(search, pos);
@@ -160,17 +161,19 @@ enum llm_arch {
     LLM_ARCH_GPTJ,
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
+    LLM_ARCH_STARCODER,
     LLM_ARCH_UNKNOWN,
 };
 
 static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"    },
-    { LLM_ARCH_FALCON,          "falcon"   },
-    { LLM_ARCH_GPT2,            "gpt2"     },
-    { LLM_ARCH_GPTJ,            "gptj"     },
-    { LLM_ARCH_GPTNEOX,         "gptneox"  },
-    { LLM_ARCH_MPT,             "mpt"      },
-    { LLM_ARCH_BAICHUAN,        "baichuan" },
+    { LLM_ARCH_LLAMA,           "llama"     },
+    { LLM_ARCH_FALCON,          "falcon"    },
+    { LLM_ARCH_GPT2,            "gpt2"      },
+    { LLM_ARCH_GPTJ,            "gptj"      },
+    { LLM_ARCH_GPTNEOX,         "gptneox"   },
+    { LLM_ARCH_MPT,             "mpt"       },
+    { LLM_ARCH_BAICHUAN,        "baichuan"  },
+    { LLM_ARCH_STARCODER,       "starcoder" },
 };
 
 enum llm_kv {
@@ -376,6 +379,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
         },
     },
+    {
+        LLM_ARCH_STARCODER,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_POS_EMBD,    "position_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -680,6 +698,7 @@ struct llama_mmap {
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                         llama_format_win_err(GetLastError()).c_str());
+            }
         }
 #else
 #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -889,9 +908,11 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_1B,
     MODEL_3B,
     MODEL_7B,
     MODEL_13B,
+    MODEL_15B,
     MODEL_30B,
     MODEL_34B,
     MODEL_40B,
@@ -901,24 +922,24 @@ enum e_model {
 
 static const size_t kB = 1024;
 static const size_t MB = kB*kB;
+static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab
-    uint32_t n_ctx_train
-    uint32_t n_ctx
-    uint32_t n_embd
-    uint32_t n_head
-    uint32_t n_head_kv
-    uint32_t n_layer
-    uint32_t n_rot
-    uint32_t n_ff
-
-    float f_norm_eps
-    float f_norm_rms_eps
-
-    float rope_freq_base
-    float rope_freq_scale
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
+
+    float f_norm_eps;
+    float f_norm_rms_eps;
+
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -960,13 +981,22 @@ struct llama_layer {
     struct ggml_tensor * wo;
     struct ggml_tensor * wqkv;
 
+    // attention bias
+    struct ggml_tensor * bo;
+    struct ggml_tensor * bqkv;
+
     // normalization
     struct ggml_tensor * ffn_norm;
+    struct ggml_tensor * ffn_norm_b;
 
     // ff
     struct ggml_tensor * w1; // ffn_gate
     struct ggml_tensor * w2; // ffn_down
     struct ggml_tensor * w3; // ffn_up
+
+    // ff bias
+    struct ggml_tensor * b2; // ffn_down
+    struct ggml_tensor * b3; // ffn_up
 };
 
 struct llama_kv_cache {
@@ -1040,10 +1070,11 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
+    struct ggml_tensor * pos_embeddings;
 
     struct ggml_tensor * output_norm;
     struct ggml_tensor * output_norm_b;
@@ -1244,6 +1275,7 @@ struct llama_model_loader {
     int n_created = 0;
 
     int64_t n_elements = 0;
+    size_t n_bytes = 0;
 
     bool use_mmap = false;
 
@@ -1276,6 +1308,7 @@ struct llama_model_loader {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
             n_elements += ggml_nelements(t);
+            n_bytes += ggml_nbytes(t);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -1554,7 +1587,7 @@ struct llama_model_loader {
 // load LLaMA models
 //
 
-std::string llama_model_ftype_name(enum llama_ftype ftype) {
+static std::string llama_model_ftype_name(enum llama_ftype ftype) {
     if (ftype & LLAMA_FTYPE_GUESSED) {
         return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
     }
@@ -1587,9 +1620,11 @@ std::string llama_model_ftype_name(enum llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
+        case MODEL_1B:  return "1B";
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
+        case MODEL_15B: return "15B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_40B: return "40B";
@@ -1633,28 +1668,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    //
-
-
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
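
Note: the GGUF key read above (LLM_KV_ROPE_SCALE_LINEAR) stores the linear RoPE scaling factor, while the runtime keeps its inverse in rope_freq_scale. A minimal sketch of that relationship, using a made-up example value (the 4.0 below is illustrative, not taken from this diff):

    // Illustrative only: mirrors the inversion performed in llm_load_hparams above.
    float ropescale       = 4.0f;              // hypothetical GGUF value: 4x linear RoPE scaling
    float rope_freq_scale = 1.0f / ropescale;  // value kept by the runtime: 0.25f
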
@@ -1707,6 +1731,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
+                    case 42: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_15B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     };
 
@@ -1860,7 +1895,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml.n_elements*1e-9);
+    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_bytes < GB) {
+        LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
+    }
 
     // general kv
     LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str());
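
Note: the BPW figure in the new log lines above is bits per weight, computed as n_bytes*8.0/n_elements. A worked example with hypothetical numbers (neither value is taken from this diff):

    // Illustrative arithmetic for the new "model size" log line.
    const size_t  n_bytes    = 4080218931ULL;            // hypothetical quantized model, ~3.80 GiB on disk
    const int64_t n_elements = 6738415616LL;             // hypothetical parameter count, ~6.74 B
    const double size_gib = n_bytes/1024.0/1024.0/1024.0; // ~3.80 GiB
    const double bpw      = n_bytes*8.0/n_elements;       // ~4.84 bits per weight
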
@@ -2160,6 +2200,85 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+
+                // output
+                {
+                    ggml_backend backend_norm;
+                    ggml_backend backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
+                        // on Windows however this is detrimental unless everything is on the GPU
+#ifndef _WIN32
+                        backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#else
+                        backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+#endif // _WIN32
+
+                        backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                        vram_weights += ggml_nbytes(model.output_norm_b);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+                    const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+                    layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
+
+                    layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+                    layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+
+                    layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
+                    layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
+
+                    layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                            ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
+                            ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) +
+                            ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3);
+                    }
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     };
@@ -3299,6 +3418,235 @@ static struct ggml_cgraph * llm_build_falcon(
     return gf;
 }
 
+static struct ggml_cgraph * llm_build_starcoder(
+        llama_context & lctx,
+        const llama_token * tokens,
+        const float * embd,
+        int n_tokens,
+        int n_past) {
+
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
+
+    const int N = n_tokens;
+
+    const auto & model = lctx.model;
+    const auto & hparams = model.hparams;
+
+    const auto & kv_self = lctx.kv_self;
+
+    GGML_ASSERT(!!kv_self.ctx);
+
+    const int64_t n_embd      = hparams.n_embd;
+    const int64_t n_layer     = hparams.n_layer;
+    const int64_t n_ctx       = hparams.n_ctx;
+    const int64_t n_head      = hparams.n_head;
+    const int64_t n_head_kv   = hparams.n_head_kv;
+    const int64_t n_embd_head = hparams.n_embd_head();
+    const int64_t n_embd_gqa  = hparams.n_embd_gqa();
+
+    GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+    const float norm_eps = hparams.f_norm_eps;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc   =*/ false,
+    };
+
+    params.no_alloc = true;
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+    ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    struct ggml_tensor * cur;
+    struct ggml_tensor * token;
+    struct ggml_tensor * position;
+    struct ggml_tensor * inpL;
+
+    if (tokens) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+
+        ggml_allocr_alloc(lctx.alloc, inp_tokens);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
+        }
+        ggml_set_name(inp_tokens, "inp_tokens");
+
+        token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
+
+        ggml_allocr_alloc(lctx.alloc, token);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            memcpy(token->data, embd, N * n_embd * ggml_element_size(token));
+        }
+    }
+
+    {
+        // Compute position embeddings.
+        struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+        ggml_allocr_alloc(lctx.alloc, inp_positions);
+        if (!ggml_allocr_is_measure(lctx.alloc)) {
+            for (int i = 0; i < N; ++i) {
+                ((int32_t *) inp_positions->data)[i] = n_past + i;
+            }
+        }
+        ggml_set_name(inp_positions, "inp_positions");
+
+        position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions);
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(lctx.alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(lctx.alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+    }
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
+
+    inpL = ggml_add(ctx0, token, position);
+    ggml_set_name(inpL, "inpL");
+
+    for (int il = 0; il < n_layer; ++il) {
+        {
+            // Norm
+            cur = ggml_norm(ctx0, inpL, norm_eps);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
+        }
+
+        {
+            // Self Attention
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
+
+            struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*n_embd);
+            struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, N, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
+
+            struct ggml_tensor * Qcur = tmpq;
+            struct ggml_tensor * Kcur = tmpk;
+
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, N));
+                ggml_set_name(Vcur, "Vcur");
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
+                ggml_set_name(k, "k");
+
+                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
+                        (   n_ctx)*ggml_element_size(kv_self.v),
+                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            struct ggml_tensor * Q =
+                ggml_permute(ctx0,
+                        ggml_cpy(ctx0,
+                            Qcur,
+                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, N)),
+                        0, 2, 1, 3);
+            ggml_set_name(Q, "Q");
+
+            struct ggml_tensor * K =
+                ggml_view_3d(ctx0, kv_self.k,
+                        n_embd_head, n_past + N, n_head_kv,
+                        ggml_element_size(kv_self.k)*n_embd_gqa,
+                        ggml_element_size(kv_self.k)*n_embd_head,
+                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
+            ggml_set_name(K, "K");
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            ggml_set_name(KQ, "KQ");
+
+            // KQ_scaled = KQ / sqrt(n_embd_head)
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
+            ggml_set_name(KQ_scaled, "KQ_scaled");
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            ggml_set_name(KQ_masked, "KQ_masked");
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            ggml_set_name(KQ_soft_max, "KQ_soft_max");
+
+            // split cached V into n_head heads
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, kv_self.v,
+                        n_past + N, n_embd_head, n_head_kv,
+                        ggml_element_size(kv_self.v)*n_ctx,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
+                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
+            ggml_set_name(V, "V");
+
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            ggml_set_name(KQV, "KQV");
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            ggml_set_name(KQV_merged, "KQV_merged");
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            ggml_set_name(cur, "KQV_merged_contiguous");
+        }
+
+        // Projection
+        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
+
+        // Add the input
+        cur = ggml_add(ctx0, cur, inpL);
+
+        struct ggml_tensor * inpFF = cur;
+
+        // FF
+        {
+            // Norm
+            {
+                cur = ggml_norm(ctx0, inpFF, norm_eps);
+                cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
+            }
+
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // Projection
+            cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
+        }
+
+        inpL = ggml_add(ctx0, cur, inpFF);
+    }
+
+    // Output Norm
+    {
+        cur = ggml_norm(ctx0, inpL, norm_eps);
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
+    }
+    ggml_set_name(cur, "result_norm");
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    ggml_set_name(cur, "result_output");
+
+    ggml_build_forward_expand(gf, cur);
+    ggml_free(ctx0);
+
+    return gf;
+}
+
 static struct ggml_cgraph * llama_build_graph(
         llama_context & lctx,
         const llama_token * tokens,
@@ -3322,6 +3670,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
             } break;
+        case LLM_ARCH_STARCODER:
+            {
+                result = llm_build_starcoder(lctx, tokens, embd, n_tokens, n_past);
+            } break;
         default:
             GGML_ASSERT(false);
     };
@@ -3408,6 +3760,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }
 
+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 
@@ -3423,10 +3784,6 @@ static bool llama_eval_internal(
     if (lctx.ctx_metal) {
         ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
         ggml_metal_graph_compute(lctx.ctx_metal, gf);
-        ggml_metal_get_tensor (lctx.ctx_metal, res);
-        if (!lctx.embedding.empty()) {
-            ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
-        }
     } else {
         ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
     }
@@ -3939,7 +4296,7 @@ struct llama_grammar_candidate {
 
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
@@ -5537,7 +5894,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 // TODO: after the GGUF PR, this likely won't work and needs to be updated
-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+static int llama_apply_lora_from_file_internal(
+    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 
     const int64_t t_start_lora_us = ggml_time_us();
@@ -5821,8 +6180,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
-        /*.rope_freq_base =*/ 10000.0f,
-        /*.rope_freq_scale =*/ 1.0f,
+        /*.rope_freq_base =*/ 0.0f,
+        /*.rope_freq_scale =*/ 0.0f,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram =*/ false,
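
Note: with the defaults above changed to 0.0f, a caller that leaves rope_freq_base/rope_freq_scale untouched now gets the values stored in the model's GGUF metadata (or 10000.0f and 1.0f when the keys are absent), per the llm_load_hparams change earlier in this diff. A minimal usage sketch; the model path is a placeholder, and error handling is omitted:

    #include "llama.h"

    int main() {
        struct llama_context_params params = llama_context_default_params();
        // params.rope_freq_base == 0.0f and params.rope_freq_scale == 0.0f here,
        // so the model file's RoPE settings win; set non-zero values to override them.
        struct llama_model * model = llama_load_model_from_file("model.gguf", params); // placeholder path
        struct llama_context * ctx = llama_new_context_with_model(model, params);
        // ... evaluate tokens with ctx ...
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }
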
@@ -6084,7 +6443,7 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_init_from_file(
+static struct llama_context * llama_init_from_file(
         const char * path_model,
         struct llama_context_params params) {
     struct llama_model * model = llama_load_model_from_file(path_model, params);
@@ -6289,7 +6648,7 @@ struct llama_data_file_context : llama_data_context {
  * llama_copy_state_data(ctx, &data_ctx);
  *
 */
-void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
+static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
         std::stringstream rng_ss;
@@ -6673,19 +7032,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
 int llama_tokenize(
         struct llama_context * ctx,
         const char * text,
+        int text_len,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
+    return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
 }
 
 int llama_tokenize_with_model(
         const struct llama_model * model,
         const char * text,
+        int text_len,
         llama_token * tokens,
         int n_max_tokens,
         bool add_bos) {
-    auto res = llama_tokenize_internal(model->vocab, text, add_bos);
+    auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);
 
     if (n_max_tokens < (int) res.size()) {
         // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
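
Note: both tokenization entry points above now take an explicit text_len, so callers can pass text that is not NUL-terminated. A minimal calling sketch against the new signature (the helper name and the 512-token buffer are arbitrary choices for this example):

    #include <cstring>
    #include <vector>
    #include "llama.h"

    // ctx is assumed to be an already-initialized llama_context.
    static std::vector<llama_token> tokenize_prompt(struct llama_context * ctx, const char * prompt) {
        std::vector<llama_token> tokens(512); // arbitrary upper bound for this sketch
        const int n = llama_tokenize(ctx, prompt, (int) strlen(prompt),
                                     tokens.data(), (int) tokens.size(), /*add_bos=*/ true);
        tokens.resize(n > 0 ? n : 0); // a negative return indicates the buffer was too small
        return tokens;
    }
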
@@ -6827,7 +7188,9 @@ void llama_dump_timing_info_yaml(FILE * stream, const llama_context * ctx) {
 }
 
 // For internal test use
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx) {
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
     return ctx->model.tensors_by_name;
 }
 