llama_cpp 0.15.2 → 0.15.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +49 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +6 -17
- data/vendor/tmp/llama.cpp/ggml-common.h +0 -54
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +72 -30
- data/vendor/tmp/llama.cpp/ggml-cuda.h +1 -0
- data/vendor/tmp/llama.cpp/ggml-impl.h +40 -0
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +4 -0
- data/vendor/tmp/llama.cpp/ggml-metal.m +68 -70
- data/vendor/tmp/llama.cpp/ggml-metal.metal +24 -409
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +4 -1
- data/vendor/tmp/llama.cpp/ggml-quants.c +1879 -2450
- data/vendor/tmp/llama.cpp/ggml-rpc.cpp +176 -53
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +40 -500
- data/vendor/tmp/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +202 -225
- data/vendor/tmp/llama.cpp/ggml.c +376 -758
- data/vendor/tmp/llama.cpp/ggml.h +39 -27
- data/vendor/tmp/llama.cpp/llama.cpp +823 -593
- data/vendor/tmp/llama.cpp/llama.h +10 -3
- metadata +3 -3
data/vendor/tmp/llama.cpp/llama.cpp:
@@ -26,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-#ifdef GGML_USE_MPI
-#  include "ggml-mpi.h"
-#endif
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 60
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
@@ -205,7 +198,6 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_MPT,             "mpt"        },
    { LLM_ARCH_BAICHUAN,        "baichuan"   },
    { LLM_ARCH_STARCODER,       "starcoder"  },
-   { LLM_ARCH_PERSIMMON,       "persimmon"  },
    { LLM_ARCH_REFACT,          "refact"     },
    { LLM_ARCH_BERT,            "bert"       },
    { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_COMMAND_R,       "command-r"  },
    { LLM_ARCH_DBRX,            "dbrx"       },
    { LLM_ARCH_OLMO,            "olmo"       },
+   { LLM_ARCH_ARCTIC,          "arctic"     },
    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
 };
 
@@ -309,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR,         "%s.rope.scale_linear"                    },
     { LLM_KV_ROPE_SCALING_TYPE,         "%s.rope.scaling.type"                    },
     { LLM_KV_ROPE_SCALING_FACTOR,       "%s.rope.scaling.factor"                  },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR,  "%s.rope.scaling.attn_factor"             },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED,    "%s.rope.scaling.finetuned"               },
 
@@ -441,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
@@ -460,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP,  // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
-            { LLM_TENSOR_OUTPUT,        "output" },
-            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,      "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm" },
-            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm" },
-            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
-        },
-    },
     {
         LLM_ARCH_MPT,
         {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_PHI3,
         {
-            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
-            { LLM_TENSOR_OUTPUT,             "output" },
-            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG,  "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,           "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
         },
     },
     {
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,  "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1697,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
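Note: with this change the CUDA backend's log output is routed through the same callback as Metal's. A minimal sketch of how an application embedding llama.cpp can capture that output through the public API (the filtering policy is illustrative):

    #include <cstdio>
    #include "llama.h"

    // llama_log_set() installs the callback; llama.cpp forwards it to the
    // backend-specific ggml_backend_*_log_set_callback() hooks shown above.
    static void my_log_cb(enum ggml_log_level level, const char * text, void * /*user_data*/) {
        if (level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_WARN) {
            fputs(text, stderr); // surface only warnings and errors
        }
    }

    int main() {
        llama_log_set(my_log_cb, nullptr);
        // ... load a model and run inference as usual ...
        return 0;
    }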
@@ -1710,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1743,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -1770,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
@@ -1818,6 +1835,7 @@ struct llama_hparams {
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1915,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long  = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
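Note: the new rope_long/rope_short tensors back the LongRoPE-style scaling used by Phi-3 128K models: the checkpoint ships one vector of per-dimension frequency factors for extended contexts and one for short contexts, and the graph-build code picks between them at runtime (see the build_rope_factors() sketch at the end of this diff).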
@@ -2268,10 +2291,6 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
 };
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx    = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
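Note: with the local n_ctx alias removed, the slot search reads the ring capacity directly from cache.size; the algorithm itself is unchanged. A self-contained sketch of the same linear scan over a ring of KV cells (simplified so that a cell is free when its position is negative):

    #include <cstdint>
    #include <vector>

    // Find `n_tokens` contiguous free cells in a ring of pos.size() cells,
    // starting at `head`. Returns the start index, or -1 if none exists.
    int64_t find_slot(const std::vector<int32_t> & pos, uint32_t head, uint32_t n_tokens) {
        const uint32_t size = (uint32_t) pos.size();
        if (n_tokens > size) return -1;
        uint32_t n_tested = 0;
        while (true) {
            if (head + n_tokens > size) { // would run past the end: wrap around
                n_tested += size - head;
                head = 0;
                continue;
            }
            bool found = true;
            for (uint32_t i = 0; i < n_tokens; i++) {
                if (pos[head + i] >= 0) { // cell is occupied
                    found     = false;
                    head     += i + 1;
                    n_tested += i + 1;
                    break;
                }
            }
            if (found) return head;
            if (n_tested >= size) return -1; // scanned the whole ring
        }
    }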
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
         return get_arr_n(llm_kv(kid), result, required);
     }
 
+    template<typename T>
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+        }
+
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32   || std::is_same<T, int>::value));
+
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
     template<typename T>
     bool get_key(const std::string & key, T & result, const bool required = true) {
         auto it = kv_overrides.find(key);
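Note: get_arr() is the array counterpart to get_key() and is what lets the loader read whole float32/int32 metadata arrays (such as rope-scaling factor lists) out of a GGUF file. A hypothetical call, with an illustrative key name:

    std::vector<float> factors;
    // required = false: a missing key yields `false` instead of an exception
    if (ml.get_arr("phi3.rope.scaling.long_factors", factors, /*required =*/ false)) {
        LLAMA_LOG_INFO("%s: read %zu rope factors\n", __func__, factors.size());
    }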
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));
 
-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
 
         return tensor;
     }
@@ -3443,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED   = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
         if (cur == NULL) {
             return NULL;
         }
 
-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
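Note: TENSOR_NOT_REQUIRED and TENSOR_DUPLICATED replace the old trailing `bool required` parameter, which is why nearly every create_tensor() call site below changes in lockstep. Because they are bit flags they can also be combined. A condensed sketch of the calling convention, using calls taken from this diff:

    // default: tensor must exist, accounted in n_created
    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

    // optional: returns NULL instead of throwing when the tensor is absent
    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
                                    llama_model_loader::TENSOR_NOT_REQUIRED);

    // duplicated: re-created from another tensor's data; accounted in size_data, not n_created
    if (model.output == NULL) {
        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},
                                        llama_model_loader::TENSOR_DUPLICATED);
    }

    // flags are maskable: optional, and a duplicate on every layer but the first
    layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), {n_embd_head/2},
                                       llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));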
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case MODEL_17M:    return "17M";
-        case MODEL_22M:    return "22M";
-        case MODEL_33M:    return "33M";
-        case MODEL_109M:   return "109M";
-        case MODEL_137M:   return "137M";
-        case MODEL_335M:   return "335M";
-        case MODEL_0_5B:   return "0.5B";
-        case MODEL_1B:     return "1B";
-        case MODEL_2B:     return "2B";
-        case MODEL_3B:     return "3B";
-        case MODEL_4B:     return "4B";
-        case MODEL_7B:     return "7B";
-        case MODEL_8B:     return "8B";
-        case MODEL_12B:    return "12B";
-        case MODEL_13B:    return "13B";
-        case MODEL_14B:    return "14B";
-        case MODEL_15B:    return "15B";
-        case MODEL_20B:    return "20B";
-        case MODEL_30B:    return "30B";
-        case MODEL_34B:    return "34B";
-        case MODEL_35B:    return "35B";
-        case MODEL_40B:    return "40B";
-        case MODEL_65B:    return "65B";
-        case MODEL_70B:    return "70B";
-        case MODEL_314B:   return "314B";
-        case MODEL_SMALL:  return "0.1B";
-        case MODEL_MEDIUM: return "0.4B";
-        case MODEL_LARGE:  return "0.8B";
-        case MODEL_XL:     return "1.5B";
-        case MODEL_A2_7B:  return "A2.7B";
-        case MODEL_8x7B:   return "8x7B";
-        case MODEL_8x22B:  return "8x22B";
-        case MODEL_16x12B: return "16x12B";
-        default:           return "?B";
+        case MODEL_14M:           return "14M";
+        case MODEL_17M:           return "17M";
+        case MODEL_22M:           return "22M";
+        case MODEL_33M:           return "33M";
+        case MODEL_70M:           return "70M";
+        case MODEL_109M:          return "109M";
+        case MODEL_137M:          return "137M";
+        case MODEL_160M:          return "160M";
+        case MODEL_335M:          return "335M";
+        case MODEL_410M:          return "410M";
+        case MODEL_0_5B:          return "0.5B";
+        case MODEL_1B:            return "1B";
+        case MODEL_1_4B:          return "1.4B";
+        case MODEL_2B:            return "2B";
+        case MODEL_2_8B:          return "2.8B";
+        case MODEL_3B:            return "3B";
+        case MODEL_4B:            return "4B";
+        case MODEL_6_9B:          return "6.9B";
+        case MODEL_7B:            return "7B";
+        case MODEL_8B:            return "8B";
+        case MODEL_12B:           return "12B";
+        case MODEL_13B:           return "13B";
+        case MODEL_14B:           return "14B";
+        case MODEL_15B:           return "15B";
+        case MODEL_20B:           return "20B";
+        case MODEL_30B:           return "30B";
+        case MODEL_34B:           return "34B";
+        case MODEL_35B:           return "35B";
+        case MODEL_40B:           return "40B";
+        case MODEL_65B:           return "65B";
+        case MODEL_70B:           return "70B";
+        case MODEL_314B:          return "314B";
+        case MODEL_SMALL:         return "0.1B";
+        case MODEL_MEDIUM:        return "0.4B";
+        case MODEL_LARGE:         return "0.8B";
+        case MODEL_XL:            return "1.5B";
+        case MODEL_A2_7B:         return "A2.7B";
+        case MODEL_8x7B:          return "8x7B";
+        case MODEL_8x22B:         return "8x22B";
+        case MODEL_16x12B:        return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default:                  return "?B";
     }
 }
 
@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_REFACT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
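Note: for GPTNeoX the layer count alone is ambiguous across the Pythia suite, so the nested switch also keys on n_ff; for example, the two 24-layer Pythia checkpoints are told apart by n_ff = 4096 (410M) versus n_ff = 8192 (1.4B) in the cases above.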
@@ -4461,6 +4584,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "qwen2") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+            } else if (
+                tokenizer_pre == "stablelm2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
             } else if (
                 tokenizer_pre == "olmo") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -4582,7 +4708,8 @@ static void llm_load_vocab(
                     (t.first == "<|eot_id|>" ||
                      t.first == "<|im_end|>" ||
                      t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>"
+                     t.first == "<end_of_turn>" ||
+                     t.first == "<|endoftext|>"
                     )
                ) {
                 vocab.special_eot_id = t.second;
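Note: the extra "<|endoftext|>" match lets models whose chat template ends a turn with that literal token (Stable LM 2, whose tokenizer pre-type is added just above, is a likely motivation) be detected correctly; the first vocab entry matching any of the listed strings becomes vocab.special_eot_id.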
@@ -4908,6 +5035,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd       = hparams.n_embd;
+        const int64_t n_embd_head  = n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa   = n_embd_v_gqa;
@@ -4942,12 +5070,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                         }
                     }
                 }
@@ -4966,10 +5092,10 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
                     // optional bias tensors
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, false);
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
@@ -4980,7 +5106,7 @@ static bool llm_load_tensors(
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (layer.ffn_gate_exps) {
                             layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
                             layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
@@ -5022,12 +5148,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -5050,7 +5174,7 @@ static bool llm_load_tensors(
 
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, false);
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (layer.ffn_gate_exps) {
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
                         layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff, n_expert});
@@ -5152,11 +5276,9 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }
 
@@ -5169,8 +5291,8 @@ static bool llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false);
-                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, false);
+                    layer.attn_norm_2   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
                     layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5188,7 +5310,12 @@ static bool llm_load_tensors(
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    if (!model.output) {
+                        // needs to be on GPU
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -5216,47 +5343,6 @@ static bool llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-                {
-                    model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                }
-
-                for (int i = 0; i < n_layer; ++i) {
-                    ggml_context * ctx_layer = ctx_for_layer(i);
-                    ggml_context * ctx_split = ctx_for_layer_split(i);
-
-                    auto & layer = model.layers[i];
-
-                    layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
-
-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
-
-                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
-
-                    layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
-
-                    layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
-
-                    layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
-
-                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),   {64});
-
-                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {64});
-                }
-            } break;
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
             {
@@ -5325,14 +5411,14 @@ static bool llm_load_tensors(
                     layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
                     layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i),   {n_embd});
 
-                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),   {n_embd}, false);
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
                     layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i),   {n_embd_gqa});
 
-                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {n_embd}, false);
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i),   {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
                     layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i),   {n_embd_gqa});
@@ -5394,18 +5480,16 @@ static bool llm_load_tensors(
         case LLM_ARCH_MPT:
             {
                 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false);
+                model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                 // output
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (!model.output) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
                     }
                 }
 
@@ -5416,31 +5500,31 @@ static bool llm_load_tensors(
                     auto & layer = model.layers[i];
 
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false);
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false);
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_q_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
-                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false);
-                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false);
+                    layer.attn_k_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     // AWQ ScaleActivation layer
-                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
+                    layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
                 }
             } break;
         case LLM_ARCH_STABLELM:
|
@@ -5469,17 +5553,17 @@ static bool llm_load_tensors(
|
|
5469
5553
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
5470
5554
|
|
5471
5555
|
// optional bias tensors, present in Stable LM 2 1.6B
|
5472
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
5473
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
5474
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
5556
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5557
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5558
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5475
5559
|
|
5476
5560
|
// optional q and k layernorms, present in StableLM 2 12B
|
5477
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
|
5478
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
|
5561
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5562
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5479
5563
|
|
5480
5564
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
5481
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
|
5482
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
5565
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5566
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
5483
5567
|
|
5484
5568
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
5485
5569
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
@@ -5522,12 +5606,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -5625,8 +5707,8 @@ static bool llm_load_tensors(
                     layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
                     layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
 
-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
-                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, false);
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     if (layer.wqkv == nullptr) {
                         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5663,17 +5745,20 @@ static bool llm_load_tensors(
                     ggml_context* ctx_layer = ctx_for_layer(i);
                    ggml_context* ctx_split = ctx_for_layer_split(i);
 
-                    auto& layer = model.layers[i];
+                    auto & layer = model.layers[i];
 
                     layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
 
-                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false);
-                    layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.wo   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
 
                     layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
                     layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), { n_embd, 2 * n_ff });
+
+                    layer.rope_long  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                    layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
                 }
             } break;
         case LLM_ARCH_PLAMO:
|
@@ -5842,9 +5927,7 @@ static bool llm_load_tensors(
|
|
5842
5927
|
|
5843
5928
|
// output
|
5844
5929
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
5845
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
5846
|
-
ml.n_created--; // artificial tensor
|
5847
|
-
ml.size_data += ggml_nbytes(model.output);
|
5930
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
5848
5931
|
|
5849
5932
|
const int64_t n_ff = hparams.n_ff;
|
5850
5933
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
@@ -5879,12 +5962,10 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
 
                 }
@@ -5935,12 +6016,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed, duplicated to allow offloading
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -6001,9 +6080,7 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     // init output from the input tok embed
-                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                 }
 
                 for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6112,10 @@ static bool llm_load_tensors(
 
                 // output
                 {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -6060,6 +6135,81 @@ static bool llm_load_tensors(
|
|
6060
6135
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
6061
6136
|
}
|
6062
6137
|
} break;
|
6138
|
+
+            case LLM_ARCH_GPTNEOX:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    // output
+                    {
+                        model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                        layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i),   {n_embd});
+
+                        layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                        layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd + 2*n_embd_gqa});
+
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                        layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i),   {n_embd});
+
+                        layer.ffn_norm   = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                        layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i),   {n_embd});
+
+                        layer.ffn_down   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                        layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i),   {n_embd});
+
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                        layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i),   {n_ff});
+                    }
+                } break;
+            case LLM_ARCH_ARCTIC:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output      = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+                        layer.ffn_up   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_gate_inp  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP,  "weight", i), {n_embd, n_expert});
+                        layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {  n_ff, n_embd, n_expert});
+                        layer.ffn_up_exps   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd, n_ff, n_expert});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
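The new GPT-NeoX loader above stores the attention projection as a single fused `wqkv` tensor whose second dimension is `n_embd + 2*n_embd_gqa` (the Q block followed by the K and V blocks). A minimal sketch of that dimension arithmetic, with assumed Pythia-style values that are illustrative and not taken from this diff:

    #include <cassert>
    #include <cstdint>

    int main() {
        // assumed example values, not from the diff:
        const int64_t n_embd    = 4096; // hidden size
        const int64_t n_head    = 32;   // query heads
        const int64_t n_head_kv = 32;   // GPT-NeoX uses MHA, so n_head_kv == n_head

        const int64_t n_embd_head = n_embd / n_head;
        const int64_t n_embd_gqa  = n_embd_head * n_head_kv; // == n_embd when not grouped

        // rows of the fused QKV projection: Q block + K block + V block
        assert(n_embd + 2*n_embd_gqa == 3*n_embd); // holds only while n_head_kv == n_head
        return 0;
    }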
@@ -6324,10 +6474,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
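This `else` branch is taken when the batch carries raw float embeddings instead of token ids; dropping the stale MPI guard leaves that path unconditional. A minimal usage sketch (assumes a loaded `model` and `ctx`; passing `embd > 0` to `llama_batch_init` allocates `batch.embd` instead of `batch.token`, which routes decoding through this branch):

    const int32_t n_embd = llama_n_embd(model);
    llama_batch batch = llama_batch_init(/*n_tokens =*/ 1, /*embd =*/ n_embd, /*n_seq_max =*/ 1);
    batch.n_tokens = 1;
    // fill batch.embd[0 .. n_embd-1] with a precomputed embedding vector here
    batch.pos[0]       = 0;
    batch.n_seq_id[0]  = 1;
    batch.seq_id[0][0] = 0;
    batch.logits[0]    = true;
    llama_decode(ctx, batch);
    llama_batch_free(batch);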
@@ -6652,7 +6799,7 @@ static struct ggml_tensor * llm_build_kqv(
 
         cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
@@ -6661,7 +6808,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
            // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
            // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
            ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
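These two hunks add GPT-NeoX to the set of architectures whose attention scores are accumulated in F32. A short sketch of the pattern (the function names are the real ggml API; `ctx`, `k`, `q` are placeholders):

    // some backends default to F16 accumulation for matmuls; upgrading a
    // single node to F32 avoids NaNs in the attention scores
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); // scores = K^T * Q
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);          // accumulate this matmul in F32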
@@ -6886,17 +7033,20 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);
 
+
         for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * tmp =
                 // we rotate only the first n_rot dimensions
-                ggml_rope_custom_inplace(ctx0,
+                ggml_rope_ext_inplace(ctx0,
                         ggml_view_3d(ctx0, kv_self.k_l[il],
                             n_embd_head_k, n_head_kv, n_ctx,
                             ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                             ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                             0),
-                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
+
             cb(tmp, "K_shifted", il);
             ggml_build_forward_expand(gf, tmp);
         }
@@ -6999,6 +7149,17 @@ struct llm_build_context {
         return lctx.inp_pos;
     }
 
+    struct ggml_tensor * build_rope_factors(int il) {
+        // choose long/short freq factors based on the context size
+        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+            return model.layers[il].rope_long;
+        }
+
+        return model.layers[il].rope_short;
+    }
+
     struct ggml_tensor * build_inp_out_ids() {
         lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
        cb(lctx.inp_out_ids, "inp_out_ids", -1);
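The new `build_rope_factors` helper picks between two per-layer frequency-factor tensors (`rope_long` / `rope_short`, Phi-3-style long-context scaling) by comparing the per-sequence context size against the model's original training context. A standalone sketch of the selection rule, with assumed values (the real code reads them from `cparams`/`hparams`):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // assumed example values, not from the diff:
        const uint32_t n_ctx           = 131072; // total KV cache size
        const uint32_t n_seq_max       = 1;      // parallel sequences sharing it
        const uint32_t n_yarn_orig_ctx = 4096;   // original training context

        const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;
        // beyond the original context -> "long" freq factors, otherwise "short"
        printf("use %s rope factors\n", n_ctx_per_seq > n_yarn_orig_ctx ? "long" : "short");
        return 0;
    }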
@@ -7106,15 +7267,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
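This is the first instance of a mechanical change repeated across every architecture below: `ggml_rope_custom` becomes `ggml_rope_ext`, which takes one extra tensor argument after the positions. A sketch of the new call shape (variables are placeholders from the surrounding graph code):

    // the fourth argument is an optional F32 tensor of per-frequency scaling
    // factors (the rope_factors tensor for Phi-3-style long context), or
    // nullptr for plain RoPE
    struct ggml_tensor * rotated = ggml_rope_ext(
        ctx0, q_cur, inp_pos, /*freq_factors =*/ nullptr,
        n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);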
@@ -7236,13 +7397,13 @@ struct llm_build_context {
 
                 switch (model.type) {
                     case MODEL_7B:
-                        Qcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                        Qcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                             n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
-                        Kcur = ggml_rope_custom(
-                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                        Kcur = ggml_rope_ext(
+                            ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                             n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                             ext_factor, attn_factor, beta_fast, beta_slow
                         );
@@ -7348,15 +7509,15 @@ struct llm_build_context {
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -7469,14 +7630,14 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -7592,15 +7753,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -7744,15 +7905,15 @@ struct llm_build_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -7921,256 +8082,49 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_persimmon() {
+    struct ggml_cgraph * build_refact() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head   == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * residual = inpL;
+            struct ggml_tensor * inpSA = inpL;
 
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            // self attention
+            // self-attention
             {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
 
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
 
-                // split qkv
-                GGML_ASSERT(n_head_kv == n_head);
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
 
-                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
-                cb(tmpqkv, "tmpqkv", il);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
-                cb(tmpqkv_perm, "tmpqkv", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        0
-                    );
-                cb(tmpq, "tmpq", il);
-
-                struct ggml_tensor * tmpk = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-                    );
-                cb(tmpk, "tmpk", il);
-
-                // Q/K Layernorm
-                tmpq = llm_build_norm(ctx0, tmpq, hparams,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpq, "tmpq", il);
-
-                tmpk = llm_build_norm(ctx0, tmpk, hparams,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpk, "tmpk", il);
-
-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                    );
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        0
-                    );
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * n_rot
-                    );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        ggml_element_size(tmpk) * n_rot
-                    );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                        ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                        ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                    );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-                    );
-                cb(Vcur, "Vcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur      = ggml_get_rows(ctx0, cur,      inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
-                        NULL, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                cb(Kcur, "Kcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                cb(Qcur, "Qcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8304,15 +8258,15 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -8744,15 +8698,15 @@ struct llm_build_context {
                 }
 
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -8864,14 +8818,14 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
                 // using mode = 2 for neox mode
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -8975,15 +8929,15 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9089,15 +9043,15 @@ struct llm_build_context {
                 Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9241,8 +9195,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9252,8 +9206,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9329,6 +9283,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
                     NULL,
@@ -9360,8 +9317,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9369,8 +9326,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9476,14 +9433,14 @@ struct llm_build_context {
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
                 n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);
@@ -9684,15 +9641,15 @@ struct llm_build_context {
                 cb(tmpk, "tmpk", il);
                 cb(Vcur, "Vcur", il);
 
-                struct ggml_tensor * Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+                struct ggml_tensor * Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                struct ggml_tensor * Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
@@ -9800,15 +9757,15 @@ struct llm_build_context {
             //     cb(Vcur, "Vcur", il);
             // }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -9917,15 +9874,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -10047,15 +10004,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -10167,8 +10124,8 @@ struct llm_build_context {
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Qcur, "Qcur", il);
@@ -10176,8 +10133,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
                 cb(Qcur, "Qcur_scaled", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
                     n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
                 cb(Kcur, "Kcur", il);
@@ -10287,15 +10244,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -10577,15 +10534,15 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -10708,15 +10665,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
            cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
@@ -10780,6 +10737,274 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_gptneox() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm,
+                    model.layers[il].attn_norm_b,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
+                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+            }
+
+            // ffn
+            if (hparams.use_par_res) {
+                // attention and ffn are computed in parallel
+                // x = x + attn(ln1(x)) + ffn(ln2(x))
+
+                struct ggml_tensor * attn_out = cur;
+
+                cur = llm_build_norm(ctx0, inpL, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                cur = ggml_add(ctx0, cur, inpL);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, attn_out);
+                cb(inpL, "l_out", il);
+            } else {
+                // attention and ffn are computed sequentially
+                // x = x + attn(ln1(x))
+                // x = x + ffn(ln2(x))
+
+                struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
+                cb(ffn_inp, "ffn_inp", il);
+
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm,
+                        model.layers[il].ffn_norm_b,
+                        LLM_NORM, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   model.layers[il].ffn_up_b,
+                        NULL,                      NULL,
+                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
+                        NULL,
+                        LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
+                cb(cur, "ffn_out", il);
+
+                inpL = ggml_add(ctx0, cur, ffn_inp);
+                cb(inpL, "l_out", il);
+            }
+        }
+
+        cur = llm_build_norm(ctx0, inpL, hparams,
+                model.output_norm,
+                model.output_norm_b,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_arctic() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up,   NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
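Two structural points in the new graph builders are worth making concrete. First, `build_gptneox` branches on `hparams.use_par_res`: NeoX-family checkpoints either add the attention and FFN contributions in parallel or chain them sequentially. A toy sketch of the two residual layouts (the `Vec` type and the ops are assumed placeholders, not ggml):

    #include <vector>
    using Vec = std::vector<float>;
    static Vec ln(const Vec & x)   { return x; } // placeholder norm
    static Vec attn(const Vec & x) { return x; } // placeholder attention
    static Vec ffn(const Vec & x)  { return x; } // placeholder feed-forward
    static Vec add(Vec a, const Vec & b) {
        for (size_t i = 0; i < a.size(); ++i) a[i] += b[i];
        return a;
    }

    Vec gptneox_layer(const Vec & x, bool use_par_res) {
        if (use_par_res) {
            // x = x + attn(ln1(x)) + ffn(ln2(x))
            return add(add(x, attn(ln(x))), ffn(ln(x)));
        }
        // x = x + attn(ln1(x)); then x = x + ffn(ln2(x))
        Vec h = add(x, attn(ln(x)));
        return add(h, ffn(ln(h)));
    }

Second, `build_arctic` runs a dense gated FFN and a top-k MoE branch side by side: the dense branch works on the attention residual `ffn_inp`, while the MoE branch is normalized from the pre-FFN hidden state `inpSA`, and the two results are summed. A sketch of that wiring under the same placeholder conventions (`dense_ffn` stands in for `llm_build_ffn`, `moe_ffn` for `llm_build_moe_ffn` selecting `n_expert_used` of `n_expert` experts):

    Vec arctic_ffn_block(const Vec & ffn_inp, const Vec & inpSA) {
        Vec ffn_out = add(ffn(ln(ffn_inp)), ffn_inp); // dense branch + residual
        Vec moe_out = ffn(ln(inpSA));                 // MoE branch from inpSA
        return add(moe_out, ffn_out);                 // layer output sums both
    }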
@@ -10896,10 +11121,6 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_starcoder();
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                result = llm.build_persimmon();
-            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm.build_refact();
@@ -10994,6 +11215,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -11339,11 +11568,6 @@ static void llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11358,10 +11582,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -11399,12 +11619,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;
 
-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd  = hparams.n_embd;
@@ -12354,6 +12568,7 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 word_collection = unicode_regex_split(text, {
                     // original regex from tokenizer.json
@@ -12788,9 +13003,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True)  returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []
 
+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -12802,9 +13022,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         // and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                        if (vocab.add_space_prefix) {
-                            if (!output.size()) { // prefix with space if first token
-                                raw_text = " " + raw_text;
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) {  // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }
 
@@ -12816,6 +13048,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                 } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                     output.push_back(fragment.token);
+                    is_prev_special = true;
+                    // phi-3 special tokens without rtrim, works fine for llama-spm too
+                    special_token_rtrim = rtrim
+                        && fragment.token != vocab.special_bos_id
+                        && fragment.token != vocab.special_unk_id
+                        && fragment.token != vocab.special_eos_id;
                 }
             }
 
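The two tokenizer hunks above implement "rtrim" for special tokens: leading whitespace is stripped from a text fragment that directly follows a special token, unless that token is BOS, UNK, or EOS. A standalone illustration of the trimming rule (strings are hypothetical; the bounds check is added here for safety):

    #include <cctype>
    #include <string>

    // Strip leading whitespace from a fragment that follows a special
    // token; returns false when the fragment is all whitespace and should
    // be skipped entirely, mirroring the `continue` above.
    static bool rtrim_fragment(std::string & raw_text) {
        size_t n = 0;
        while (n < raw_text.size() && isspace((unsigned char) raw_text[n])) {
            n++;
        }
        if (n == raw_text.size()) {
            return false; // all whitespace -> skip fragment
        }
        raw_text = raw_text.substr(n);
        return true;
    }
    // e.g. rtrim_fragment(" \nHello") leaves "Hello"; a fragment after BOS
    // keeps its leading space because BOS is exempt from rtrim.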
@@ -14518,8 +14756,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15533,10 +15769,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15546,9 +15778,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }
 
 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }
 
@@ -15691,6 +15920,7 @@ struct llama_context * llama_new_context_with_model(
             cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
         }
 
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
     cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
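The effective YaRN attention factor is now the product of the user-facing context parameter and a per-model factor loaded from the GGUF metadata. A small sketch of the effect, with an illustrative factor (the 1.1f value is hypothetical, not from this diff):

    float yarn_attn_factor = 1.0f;         // default from llama_context_params
    const float rope_attn_factor = 1.1f;   // hypothetical per-model value from GGUF
    yarn_attn_factor *= rope_attn_factor;  // -> 1.1f is what inference actually uses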
@@ -15949,20 +16179,6 @@ struct llama_context * llama_new_context_with_model(
             }
         }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }
 
@@ -15999,7 +16215,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|
15999
16215
|
// these models do not use RoPE
|
16000
16216
|
case LLM_ARCH_GPT2:
|
16001
16217
|
case LLM_ARCH_GPTJ:
|
16002
|
-
case LLM_ARCH_GPTNEOX:
|
16003
16218
|
case LLM_ARCH_MPT:
|
16004
16219
|
case LLM_ARCH_REFACT:
|
16005
16220
|
case LLM_ARCH_BLOOM:
|
@@ -16019,13 +16234,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
-        case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
@@ -16036,6 +16251,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
            return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
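With GPTNEOX now reported as LLAMA_ROPE_TYPE_NEOX rather than as a non-RoPE architecture, callers can observe the change through the public llama_rope_type accessor. A minimal sketch, assuming a local model file at ./model.gguf (the path is illustrative):

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("./model.gguf", mparams);
    if (model != nullptr) {
        // Returns LLAMA_ROPE_TYPE_NONE, _NORM, or _NEOX per the arch table above.
        printf("rope type: %d\n", (int) llama_rope_type(model));
        llama_free_model(model);
    }
    llama_backend_free();
    return 0;
}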
@@ -16195,6 +16411,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16203,6 +16420,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
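The reserve() calls added in the two hunks above avoid repeated reallocation while the control-vector tensors, contexts, and buffers are pushed back one element at a time. A standalone sketch of the effect (the size is illustrative, not taken from the library):

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> tensors;
    tensors.reserve(32);                    // mirrors cvec.tensors.reserve(n_layer)
    const size_t cap = tensors.capacity();
    for (int il = 0; il < 32; il++) {
        tensors.push_back(il);              // stays within the reserved capacity
    }
    printf("reallocation avoided: %s\n", tensors.capacity() == cap ? "yes" : "no");
    return 0;
}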
@@ -17411,6 +17630,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
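The two new getters mirror llama_set_n_threads, so callers no longer need to track the thread counts themselves. A minimal usage sketch, assuming a model file at ./model.gguf and illustrative thread counts:

#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("./model.gguf", mparams);
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);
    printf("generation threads: %u, batch threads: %u\n",
           llama_n_threads(ctx), llama_n_threads_batch(ctx));

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}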
@@ -17845,6 +18072,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
@@ -17977,15 +18213,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
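Net effect of the last two hunks: the Phi 3 branch moves ahead of the zephyr check, whose `<|user|>` probe would otherwise match a Phi 3 template first, and the relocated branch no longer trims message content. A hedged sketch of exercising it through the public template API; the messages and buffer size are illustrative:

#include "llama.h"
#include <cstdio>
#include <vector>

int main() {
    llama_chat_message chat[] = {
        { "user",      "What is 2 + 2?" },
        { "assistant", "4."             },
        { "user",      "And doubled?"   },
    };
    std::vector<char> buf(1024);
    // Passing "phi3" selects the branch added above; the model pointer may be
    // null when the template name is given explicitly.
    int32_t n = llama_chat_apply_template(nullptr, "phi3", chat, 3, /*add_ass=*/true,
                                          buf.data(), (int32_t) buf.size());
    if (n > 0) {
        // Expected shape: <|user|>\n...<|end|>\n<|assistant|>\n4.<|end|>\n...<|assistant|>\n
        printf("%.*s\n", n, buf.data());
    }
    return 0;
}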
@@ -18107,6 +18334,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = "      + std::to_string(ggml_cpu_has_avx512())      + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = "         + std::to_string(ggml_cpu_has_fma())         + " | ";
     s += "NEON = "        + std::to_string(ggml_cpu_has_neon())        + " | ";
     s += "ARM_FMA = "     + std::to_string(ggml_cpu_has_arm_fma())     + " | ";
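The new flag surfaces through llama_print_system_info(); a one-line check is enough to see whether the running CPU reports AVX512_BF16:

#include "llama.h"
#include <cstdio>

int main() {
    // Prints the feature string, e.g. "... AVX512_VNNI = 1 | AVX512_BF16 = 1 | FMA = 1 ..."
    printf("%s\n", llama_print_system_info());
    return 0;
}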
@@ -18167,6 +18395,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 
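With the #elif branch above, a callback installed through llama_log_set is also forwarded to the CUDA backend when the library is built with GGML_USE_CUDA. A minimal sketch of installing one (the pass-through policy is illustrative; real callers may filter by level):

#include "llama.h"
#include <cstdio>

// Forward every log line to stderr; level-based filtering is left to the caller.
static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
}

int main() {
    llama_log_set(my_log, nullptr);
    llama_backend_init();
    // ... load models and run inference; backend log lines now reach my_log ...
    llama_backend_free();
    return 0;
}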