llama_cpp 0.15.2 → 0.15.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/llama.cpp:

@@ -26,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256

 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
 #endif

 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 60
+#define LLAMA_MAX_EXPERTS 128

 //
 // logging
@@ -205,7 +198,6 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,

@@ -229,6 +221,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };

@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT, "mpt" },
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },

@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_ARCTIC, "arctic" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -309,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,

@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },

@@ -441,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,

@@ -460,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
-{
-    LLM_ARCH_PERSIMMON,
-    {
-        { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-        { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-        { LLM_TENSOR_OUTPUT, "output"},
-        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-        { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-        { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-        { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-    },
-},
 {
     LLM_ARCH_MPT,
     {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
 {
     LLM_ARCH_PHI3,
     {
-        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-        { LLM_TENSOR_OUTPUT, "output" },
-        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+        { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
 {
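The two new `rope_factors_long`/`rope_factors_short` entries have no `blk.%d.` prefix, so they name model-wide tensors rather than per-block ones; per-block names come from expanding the `%d` patterns with the layer index. A small sketch of that expansion (the real loader does this through its `tn()` helper, whose internals are not shown in this diff, so the formatting below is an approximation):

```cpp
#include <cstdio>
#include <string>

// Expand a tensor-name pattern such as "blk.%d.attn_qkv" for layer i and
// append a suffix, approximating what the loader's tn() helper produces.
static std::string tensor_name(const char * pattern, const char * suffix, int i) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, i);
    return std::string(buf) + "." + suffix;
}

int main() {
    // prints "blk.3.attn_qkv.weight"
    std::printf("%s\n", tensor_name("blk.%d.attn_qkv", "weight", 3).c_str());
    // prints "rope_factors_long.weight" (no %d, the layer index is ignored)
    std::printf("%s\n", tensor_name("rope_factors_long", "weight", 0).c_str());
    return 0;
}
```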
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
         { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
     },
 },
+{
+    LLM_ARCH_ARCTIC,
+    {
+        { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
+        { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+        { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+        { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+        { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+        { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+        { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+        { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+        { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+        { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+        { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+        { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+        { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+        { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+    },
+},
 {
     LLM_ARCH_UNKNOWN,
     {
@@ -1697,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }

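With this hunk the embedded llama.cpp forwards its logger to the CUDA backend as well as Metal. Nothing changes in the public API; a host application using llama.h directly can still route all of that output through one callback. A minimal sketch using the public `llama_log_set`:

```cpp
#include <cstdio>
#include "llama.h"

// Route llama.cpp (and, after this change, GPU-backend) log output through a
// single callback. The callback receives the level, the message text, and the
// user_data pointer given to llama_log_set.
static void my_logger(ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_WARN) {
        std::fputs(text, stderr);   // only surface warnings and errors
    }
}

int main() {
    llama_log_set(my_logger, nullptr);
    // ... load a model, run inference ...
    return 0;
}
```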
@@ -1710,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,

@@ -1743,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };

 static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;

     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
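`use_par_res` records whether a GPT-NeoX checkpoint uses the parallel-residual block layout, where attention and the MLP both read the same input and their outputs are summed, instead of running one after the other. A scalar toy sketch of the two layouts (the helpers are identity stand-ins, not llama.cpp code):

```cpp
// Toy stand-ins for the real graph ops, just to show the dataflow:
static float norm1(float x) { return x; }
static float norm2(float x) { return x; }
static float attn (float x) { return x; }
static float mlp  (float x) { return x; }

// GPT-NeoX "parallel residual" (use_par_res == true):
float block_parallel(float x) {
    return x + attn(norm1(x)) + mlp(norm2(x));
}

// Standard sequential pre-norm block (use_par_res == false):
float block_sequential(float x) {
    float y = x + attn(norm1(x));
    return y + mlp(norm2(y));
}
```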
@@ -1770,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;

+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;

@@ -1818,6 +1835,7 @@ struct llama_hparams {

     if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
     if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+    if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
     if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
     if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;

@@ -1915,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;

     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };

 struct llama_kv_cell {
@@ -2268,10 +2291,6 @@ struct llama_context {
|
|
2268
2291
|
|
2269
2292
|
// control vectors
|
2270
2293
|
struct llama_control_vector cvec;
|
2271
|
-
|
2272
|
-
#ifdef GGML_USE_MPI
|
2273
|
-
ggml_mpi_context * ctx_mpi = NULL;
|
2274
|
-
#endif
|
2275
2294
|
};
|
2276
2295
|
|
2277
2296
|
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
|
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;

     if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.

-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }

     uint32_t n_tested = 0;

     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }

@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }

-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
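The slot search now compares against `cache.size` directly instead of going through a local `n_ctx` alias, which makes it clearer that the KV cache is a ring of `cache.size` cells rather than something tied to the context length. A self-contained sketch of the same scan over a plain occupancy vector (simplified; the real cells carry positions and sequence sets):

```cpp
#include <cstdint>
#include <vector>

// Simplified sketch of the search in llama_kv_cache_find_slot: find n_tokens
// consecutive free cells in a ring of `size` cells, starting at `head` and
// wrapping once. Returns the start index, or -1 if no slot exists.
static int64_t find_slot(const std::vector<bool> & used, uint32_t head, uint32_t n_tokens) {
    const uint32_t size = (uint32_t) used.size();
    if (n_tokens > size) return -1;

    uint32_t n_tested = 0;
    while (true) {
        if (head + n_tokens > size) {       // window would run off the end: wrap
            n_tested += size - head;
            head = 0;
            continue;
        }
        bool found = true;
        for (uint32_t i = 0; i < n_tokens; ++i) {
            if (used[head + i]) {           // occupied: restart just past this cell
                head     += i + 1;
                n_tested += i + 1;
                found = false;
                break;
            }
        }
        if (found)            return (int64_t) head;
        if (n_tested >= size) return -1;    // scanned the whole ring
    }
}
```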
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
         return get_arr_n(llm_kv(kid), result, required);
     }

+    template<typename T>
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+        }
+
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
     template<typename T>
     bool get_key(const std::string & key, T & result, const bool required = true) {
         auto it = kv_overrides.find(key);
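The new `get_arr` overloads let the loader read whole float32/int32 arrays out of GGUF metadata; they exist to support the new rope-scaling data. A hedged usage sketch (a fragment: `ml` is a `llama_model_loader`, and the key string here is illustrative, not a documented GGUF key):

```cpp
// Hypothetical usage of the new get_arr helper: read an optional float array
// from GGUF metadata. Returns false (without throwing) when the key is
// missing and required == false.
std::vector<float> factors;
if (ml.get_arr("example.rope.freq_factors", factors, /*required =*/ false)) {
    // factors.size() == arr_info.length; values are copied out of the GGUF blob
    for (float f : factors) {
        (void) f; // ... consume ...
    }
}
```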
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }

-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));

-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }

         return tensor;
     }

@@ -3443,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));

         if (cur == NULL) {
             return NULL;
         }

-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }

     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
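`create_tensor` now takes a bit-flag `flags` argument instead of `bool required`: `TENSOR_NOT_REQUIRED` marks an optional tensor, and `TENSOR_DUPLICATED` books the byte count without bumping `n_created`, for tensors that alias another one (such as an output head initialized from the token embedding). This replaces the repeated `ml.n_created--; ml.size_data += ggml_nbytes(...)` pattern removed throughout the hunks below. Both call-site styles appear later in this diff:

```cpp
// Optional tensor: returns NULL instead of throwing when absent.
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"),
                                {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);

// Flags combine as a bitmask; here the tensor is optional, and for every
// layer after the first it is a duplicate of the layer-0 tensor.
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"),
                                   { n_embd_head/2 },
                                   llama_model_loader::TENSOR_NOT_REQUIRED
                                   | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
```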
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-
+        case MODEL_14M: return "14M";
+        case MODEL_17M: return "17M";
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_70M: return "70M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_160M: return "160M";
+        case MODEL_335M: return "335M";
+        case MODEL_410M: return "410M";
+        case MODEL_0_5B: return "0.5B";
+        case MODEL_1B: return "1B";
+        case MODEL_1_4B: return "1.4B";
+        case MODEL_2B: return "2B";
+        case MODEL_2_8B: return "2.8B";
+        case MODEL_3B: return "3B";
+        case MODEL_4B: return "4B";
+        case MODEL_6_9B: return "6.9B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_12B: return "12B";
+        case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
+        case MODEL_15B: return "15B";
+        case MODEL_20B: return "20B";
+        case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_314B: return "314B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        case MODEL_A2_7B: return "A2.7B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default: return "?B";
     }
 }

@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
-    case LLM_ARCH_PERSIMMON:
-        {
-            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-            switch (hparams.n_layer) {
-                case 36: model.type = e_model::MODEL_8B; break;
-                default: model.type = e_model::MODEL_UNKNOWN;
-            }
-        } break;
     case LLM_ARCH_REFACT:
         {
             ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
             switch (hparams.n_layer) {
                 case 24: model.type = e_model::MODEL_1B; break;
                 case 32: model.type = e_model::MODEL_3B; break;
+                case 40: model.type = e_model::MODEL_14B; break;
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
                 default: model.type = e_model::MODEL_UNKNOWN;
             }
         } break;
+    case LLM_ARCH_GPTNEOX:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+            ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+            switch (hparams.n_layer) {
+                case 6:
+                    switch (hparams.n_ff) {
+                        case 512: model.type = e_model::MODEL_14M; break;
+                        case 2048: model.type = e_model::MODEL_70M; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 12:
+                    switch (hparams.n_ff) {
+                        case 3072: model.type = e_model::MODEL_160M; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 16:
+                    switch (hparams.n_ff) {
+                        case 8192: model.type = e_model::MODEL_1B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 24:
+                    switch (hparams.n_ff) {
+                        case 4096: model.type = e_model::MODEL_410M; break;
+                        case 8192: model.type = e_model::MODEL_1_4B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 32:
+                    switch (hparams.n_ff) {
+                        case 10240: model.type = e_model::MODEL_2_8B; break;
+                        case 16384: model.type = e_model::MODEL_6_9B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 36:
+                    switch (hparams.n_ff) {
+                        case 20480: model.type = e_model::MODEL_12B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                case 44:
+                    switch (hparams.n_ff) {
+                        case 24576: model.type = e_model::MODEL_20B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    } break;
+                default: model.type = e_model::MODEL_UNKNOWN;
+            }
+        } break;
+    case LLM_ARCH_ARCTIC:
+        {
+            ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+            if (hparams.n_expert == 128) {
+                switch (hparams.n_layer) {
+                    case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } else {
+                model.type = e_model::MODEL_UNKNOWN;
+            }
+        } break;
     default: (void)0;
 }

@@ -4461,6 +4584,9 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "qwen2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+            tokenizer_pre == "stablelm2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
         } else if (
             tokenizer_pre == "olmo") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;

@@ -4582,7 +4708,8 @@ static void llm_load_vocab(
                 (t.first == "<|eot_id|>" ||
                  t.first == "<|im_end|>" ||
                  t.first == "<|end|>" ||
-                 t.first == "<end_of_turn>"
+                 t.first == "<end_of_turn>" ||
+                 t.first == "<|endoftext|>"
                 )
            ) {
                 vocab.special_eot_id = t.second;
@@ -4908,6 +5035,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
+        const int64_t n_embd_head = n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4942,12 +5070,10 @@ static bool llm_load_tensors(
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             if (model.arch != LLM_ARCH_MINICPM){
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }
             }
         }

@@ -4966,10 +5092,10 @@ static bool llm_load_tensors(
             layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

             // optional bias tensors
-            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
-            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
-            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
-            layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+            layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

             layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

@@ -4980,7 +5106,7 @@ static bool llm_load_tensors(
             } else {
                 layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

-                layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 if (layer.ffn_gate_exps) {
                     layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                     layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5022,12 +5148,10 @@ static bool llm_load_tensors(
         // output
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
             // if output is NULL, init from the input tok embed
             if (model.output == NULL) {
-                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
             }
         }

@@ -5050,7 +5174,7 @@ static bool llm_load_tensors(

             layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});

-            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+            layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
             if (layer.ffn_gate_exps) {
                 layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                 layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});

@@ -5152,11 +5276,9 @@ static bool llm_load_tensors(
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
             if (!model.output) {
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
             }
         }

@@ -5169,8 +5291,8 @@ static bool llm_load_tensors(
             layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
             layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-            layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
-            layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false);
+            layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

             layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
             layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

@@ -5188,7 +5310,12 @@ static bool llm_load_tensors(
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
             model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            if (!model.output) {
+                // needs to be on GPU
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+            }
+
         }

         for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5343,6 @@ static bool llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                {
-                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                }
-
-                for (int i = 0; i < n_layer; ++i) {
-                    ggml_context * ctx_layer = ctx_for_layer(i);
-                    ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                    auto & layer = model.layers[i];
-
-                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
-                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
-                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
-                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
-                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
-                }
-            } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
             {
@@ -5325,14 +5411,14 @@ static bool llm_load_tensors(
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

-                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                     layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

-                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
@@ -5394,18 +5480,16 @@ static bool llm_load_tensors(
         case LLM_ARCH_MPT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }

@@ -5416,31 +5500,31 @@ static bool llm_load_tensors(
                     auto & layer = model.layers[i];

                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // AWQ ScaleActivation layer
-                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 }
             } break;
         case LLM_ARCH_STABLELM:
@@ -5469,17 +5553,17 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

                     // optional bias tensors, present in Stable LM 2 1.6B
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // optional q and k layernorms, present in StableLM 2 12B
-                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false);
-                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false);
+                    layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false);
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5522,12 +5606,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }

@@ -5625,8 +5707,8 @@ static bool llm_load_tensors(
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

                     if (layer.wqkv == nullptr) {
                         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5745,20 @@ static bool llm_load_tensors(
                 ggml_context* ctx_layer = ctx_for_layer(i);
                 ggml_context* ctx_split = ctx_for_layer_split(i);

-                auto& layer = model.layers[i];
+                auto & layer = model.layers[i];

                 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

-                layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
-                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+                layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

                 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

                 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
                 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+                layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
             }
         } break;
     case LLM_ARCH_PLAMO:
@@ -5842,9 +5927,7 @@ static bool llm_load_tensors(

                 // output
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

                 const int64_t n_ff = hparams.n_ff;
                 const int64_t n_embd_head_k = hparams.n_embd_head_k;

@@ -5879,12 +5962,10 @@ static bool llm_load_tensors(
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }

             }

@@ -5935,12 +6016,10 @@ static bool llm_load_tensors(
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed, duplicated to allow offloading
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }
             }

@@ -6001,9 +6080,7 @@ static bool llm_load_tensors(
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 // init output from the input tok embed
-                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
             }

             for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6112,10 @@ static bool llm_load_tensors(

             // output
             {
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 // if output is NULL, init from the input tok embed
                 if (model.output == NULL) {
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }
             }

@@ -6060,6 +6135,81 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                    layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -6324,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -6652,7 +6799,7 @@ static struct ggml_tensor * llm_build_kqv(
 
         cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
@@ -6661,7 +6808,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6886,17 +7033,20 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);
 
+
         for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * tmp =
                 // we rotate only the first n_rot dimensions
-                ggml_rope_custom_inplace(ctx0,
+                ggml_rope_ext_inplace(ctx0,
                     ggml_view_3d(ctx0, kv_self.k_l[il],
                         n_embd_head_k, n_head_kv, n_ctx,
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                         ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                         0),
-                    lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
+
             cb(tmp, "K_shifted", il);
             ggml_build_forward_expand(gf, tmp);
         }
@@ -6999,6 +7149,17 @@ struct llm_build_context {
         return lctx.inp_pos;
     }
 
+    struct ggml_tensor * build_rope_factors(int il) {
+        // choose long/short freq factors based on the context size
+        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+            return model.layers[il].rope_long;
+        }
+
+        return model.layers[il].rope_short;
+    }
+
     struct ggml_tensor * build_inp_out_ids() {
         lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
         cb(lctx.inp_out_ids, "inp_out_ids", -1);
@@ -7106,15 +7267,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -7236,13 +7397,13 @@ struct llm_build_context {
 
                 switch (model.type) {
                     case MODEL_7B:
-                        Qcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                        Qcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                             n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
-                        Kcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        Kcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                             n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
@@ -7348,15 +7509,15 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -7469,14 +7630,14 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7592,15 +7753,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -7744,15 +7905,15 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -7921,256 +8082,49 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_persimmon() {
+    struct ggml_cgraph * build_refact() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * residual = inpL;
+            struct ggml_tensor * inpSA = inpL;
 
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            // self attention
+            // self-attention
             {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
 
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
 
-                // split qkv
-                GGML_ASSERT(n_head_kv == n_head);
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
 
-                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
-                cb(tmpqkv, "tmpqkv", il);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
-                cb(tmpqkv_perm, "tmpqkv", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        0
-                    );
-                cb(tmpq, "tmpq", il);
-
-                struct ggml_tensor * tmpk = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-                    );
-                cb(tmpk, "tmpk", il);
-
-                // Q/K Layernorm
-                tmpq = llm_build_norm(ctx0, tmpq, hparams,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpq, "tmpq", il);
-
-                tmpk = llm_build_norm(ctx0, tmpk, hparams,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpk, "tmpk", il);
-
-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                    );
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        0
-                    );
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * n_rot
-                    );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        ggml_element_size(tmpk) * n_rot
-                    );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                        ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                        ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-                    );
-                cb(Vcur, "Vcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
-                        NULL, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                cb(Kcur, "Kcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                cb(Qcur, "Qcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8304,15 +8258,15 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -8744,15 +8698,15 @@ struct llm_build_context {
                 }
 
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -8864,14 +8818,14 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8975,15 +8929,15 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9089,15 +9043,15 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9241,8 +9195,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9252,8 +9206,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9329,6 +9283,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
                     NULL,
@@ -9360,8 +9317,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9369,8 +9326,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9476,14 +9433,14 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
                     n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
@@ -9684,15 +9641,15 @@ struct llm_build_context {
                 cb(tmpk, "tmpk", il);
                 cb(Vcur, "Vcur", il);
 
-                struct ggml_tensor * Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                struct ggml_tensor * Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                struct ggml_tensor * Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9800,15 +9757,15 @@ struct llm_build_context {
                 //     cb(Vcur, "Vcur", il);
                 // }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9917,15 +9874,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10047,15 +10004,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10167,8 +10124,8 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
@@ -10176,8 +10133,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                 cb(Qcur, "Qcur_scaled", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
@@ -10287,15 +10244,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10577,15 +10534,15 @@ struct llm_build_context {
                     cb(Kcur, "Kcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10708,15 +10665,15 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -10780,6 +10737,274 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_gptneox() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // ffn
+            if (hparams.use_par_res) {
+                // attention and ffn are computed in parallel
+                // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+                struct ggml_tensor * attn_out = cur;
+
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, inpL);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, attn_out);
+                cb(inpL, "l_out", il);
+            } else {
+                // attention and ffn are computed sequentially
+                // x = x + attn(ln1(x))
+                // x = x + ffn(ln2(x))
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
+                        NULL, NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, ffn_inp);
+                cb(inpL, "l_out", il);
+            }
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_arctic() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
@@ -10896,10 +11121,6 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_starcoder();
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                result = llm.build_persimmon();
-            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm.build_refact();
@@ -10994,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -11339,11 +11568,6 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11582,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11619,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;
 
-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd = hparams.n_embd;
@@ -12354,6 +12568,7 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 word_collection = unicode_regex_split(text, {
                     // original regex from tokenizer.json
@@ -12788,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True)  returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []
 
+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         //  and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                        if (&fragment == &fragment_buffer.front()) {
-                            if (vocab.add_space_prefix) {
-                                raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) {  // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }
 
@@ -12816,6 +13048,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
+                        is_prev_special = true;
+                        // phi-3 special tokens without rtrim, works fine for llama-spm too
+                        special_token_rtrim = rtrim
+                            && fragment.token != vocab.special_bos_id
+                            && fragment.token != vocab.special_unk_id
+                            && fragment.token != vocab.special_eos_id;
                     }
                 }
 
@@ -14518,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +15769,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }
 
 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }
 
@@ -15691,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
    cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15949,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }
 
@@ -15999,7 +16215,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
|
|
16019
16234
|
case LLM_ARCH_XVERSE:
|
16020
16235
|
case LLM_ARCH_COMMAND_R:
|
16021
16236
|
case LLM_ARCH_OLMO:
|
16237
|
+
case LLM_ARCH_ARCTIC:
|
16022
16238
|
return LLAMA_ROPE_TYPE_NORM;
|
16023
16239
|
|
16024
16240
|
// the pairs of head values are offset by n_rot/2
|
16025
16241
|
case LLM_ARCH_FALCON:
|
16026
16242
|
case LLM_ARCH_GROK:
|
16027
16243
|
case LLM_ARCH_DBRX:
|
16028
|
-
case LLM_ARCH_PERSIMMON:
|
16029
16244
|
case LLM_ARCH_BERT:
|
16030
16245
|
case LLM_ARCH_NOMIC_BERT:
|
16031
16246
|
case LLM_ARCH_STABLELM:
|
@@ -16036,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
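Across these three hunks, `llama_rope_type` reclassifies GPT-NeoX from the no-RoPE group to LLAMA_ROPE_TYPE_NEOX, drops the removed Persimmon architecture, and adds Arctic to the NORM group. Downstream code can branch on the result; a small sketch assuming `model` was loaded elsewhere:

```cpp
#include "llama.h"
#include <cstdio>

// After this change, GPT-NeoX models report NeoX-style RoPE instead of
// "no RoPE", which matters for anything applying rotary embeddings itself.
void report_rope(const llama_model * model) {
    switch (llama_rope_type(model)) {
        case LLAMA_ROPE_TYPE_NONE: std::printf("no RoPE\n");            break;
        case LLAMA_ROPE_TYPE_NORM: std::printf("normal RoPE\n");        break;
        case LLAMA_ROPE_TYPE_NEOX: std::printf("NeoX-style RoPE\n");    break;
        default:                   std::printf("other RoPE variant\n"); break;
    }
}
```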
@@ -16195,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
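The `reserve` calls added in these two hunks pre-size the vectors in `llama_control_vector_init` so the subsequent `push_back` loops allocate once instead of growing geometrically. A standalone illustration of the effect:

```cpp
#include <cstdio>
#include <vector>

// Without reserve(), push_back may reallocate and move elements as the
// vector grows; reserving the known final size gives a single allocation
// and keeps iterators/pointers stable during the fill loop.
int main() {
    const size_t n_layer = 32;
    std::vector<int *> tensors;
    tensors.reserve(n_layer);            // single allocation up front
    tensors.push_back(nullptr);          // layer 0 has no tensor
    for (size_t il = 1; il < n_layer; ++il) {
        tensors.push_back(new int(il));  // no reallocation happens here
    }
    std::printf("size = %zu, capacity = %zu\n", tensors.size(), tensors.capacity());
    for (int * p : tensors) delete p;
    return 0;
}
```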
@@ -17411,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
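This hunk adds the read-side counterparts to `llama_set_n_threads`; they simply return what the setter stored in the context params. A usage sketch, assuming `ctx` was created elsewhere:

```cpp
#include "llama.h"
#include <cstdio>

// Set the thread counts, then read them back through the new getters.
void tune_threads(llama_context * ctx) {
    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);
    std::printf("decode threads: %u, batch threads: %u\n",
                llama_n_threads(ctx), llama_n_threads_batch(ctx));
}
```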
@@ -17845,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
@@ -17977,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
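Taken together, these two hunks move the Phi 3 branch ahead of the zephyr check, so a template containing `<|assistant|>` and `<|end|>` is matched as Phi 3 before the looser `<|user|>` test can claim it; the relocated version also stops calling `trim()` on message content. A sketch of exercising this path through the public API, passing the template name directly so no model needs to be loaded:

```cpp
#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(1024);
    const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat.data(), chat.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0 && n <= (int32_t) buf.size()) {
        // Expected shape:
        // <|system|>\n...<|end|>\n<|user|>\n...<|end|>\n<|assistant|>\n
        std::printf("%.*s", n, buf.data());
    }
    return 0;
}
```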
@@ -18107,6 +18334,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
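The system-info string now also reports AVX512_BF16 support, alongside the other CPU feature flags. Checking it from user code is a one-liner:

```cpp
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();
    // Prints something like "... AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | ..."
    std::printf("%s\n", llama_print_system_info());
    llama_backend_free();
    return 0;
}
```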
@@ -18167,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 
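With this change, a callback installed via `llama_log_set` is also forwarded to the CUDA backend (previously only Metal was wired up). A sketch of installing a filtering logger; the `ggml_log_callback` signature is taken from ggml.h:

```cpp
#include "llama.h"
#include <cstdio>

// Forward only warnings and errors to stderr; drop info/debug chatter.
static void my_logger(enum ggml_log_level level, const char * text, void * /*user_data*/) {
    if (level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_WARN) {
        std::fputs(text, stderr);
    }
}

int main() {
    llama_log_set(my_logger, nullptr);  // now also routes CUDA backend messages
    llama_backend_init();
    // ... load models, run inference ...
    llama_backend_free();
    return 0;
}
```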