@fugood/llama.node 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +1 -1
- package/src/LlamaContext.cpp +2 -2
- package/src/LoadSessionWorker.cpp +1 -0
- package/src/llama.cpp/CMakeLists.txt +72 -46
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +732 -752
- package/src/llama.cpp/common/common.h +47 -41
- package/src/llama.cpp/common/grammar-parser.cpp +1 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +89 -7
- package/src/llama.cpp/common/sampling.h +5 -0
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +2 -2
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +13 -8
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +24 -16
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +78 -14
- package/src/llama.cpp/examples/server/server.cpp +21 -9
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +0 -1
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +4 -0
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3700 -2041
- package/src/llama.cpp/ggml-rpc.cpp +188 -56
- package/src/llama.cpp/ggml-sycl.cpp +99 -530
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +202 -225
- package/src/llama.cpp/ggml.c +1034 -1154
- package/src/llama.cpp/ggml.h +59 -31
- package/src/llama.cpp/llama.cpp +859 -609
- package/src/llama.cpp/llama.h +19 -6
- package/src/llama.cpp/requirements.txt +0 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +113 -47
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/llama.cpp
CHANGED
@@ -26,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -110,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
@@ -205,7 +198,6 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
@@ -229,6 +221,7 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -242,7 +235,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_MPT, "mpt" },
     { LLM_ARCH_BAICHUAN, "baichuan" },
     { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_PERSIMMON, "persimmon" },
     { LLM_ARCH_REFACT, "refact" },
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
@@ -266,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_DBRX, "dbrx" },
     { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_ARCTIC, "arctic" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -309,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -386,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
@@ -441,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
@@ -460,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -598,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-            { LLM_TENSOR_OUTPUT, "output"},
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-        },
-    },
     {
         LLM_ARCH_MPT,
         {
@@ -825,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
     {
         LLM_ARCH_PHI3,
         {
-            { LLM_TENSOR_TOKEN_EMBD,
-            { LLM_TENSOR_OUTPUT_NORM,
-            { LLM_TENSOR_OUTPUT,
-            {
-            {
-            {
-            {
-            {
-            {
-            {
-            {
-            {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -1052,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1697,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -1710,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
    MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1743,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1752,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -1770,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
@@ -1818,6 +1835,7 @@ struct llama_hparams {
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1915,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -1952,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
@@ -2268,10 +2291,6 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_context * ctx_mpi = NULL;
-#endif
 };
 
 static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
@@ -2491,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2542,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2570,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
@@ -3330,6 +3348,39 @@ struct llama_model_loader {
         return get_arr_n(llm_kv(kid), result, required);
     }
 
+    template<typename T>
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
+
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
+        }
+
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
+
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
     template<typename T>
     bool get_key(const std::string & key, T & result, const bool required = true) {
         auto it = kv_overrides.find(key);
@@ -3404,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));
 
-        n_created++;
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
 
         return tensor;
     }
@@ -3443,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
-        const struct ggml_tensor * cur = check_tensor_dims(name, ne, required);
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
         if (cur == NULL) {
            return NULL;
        }
 
-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3750,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-
+        case MODEL_14M: return "14M";
+        case MODEL_17M: return "17M";
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_70M: return "70M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_160M: return "160M";
+        case MODEL_335M: return "335M";
+        case MODEL_410M: return "410M";
+        case MODEL_0_5B: return "0.5B";
+        case MODEL_1B: return "1B";
+        case MODEL_1_4B: return "1.4B";
+        case MODEL_2B: return "2B";
+        case MODEL_2_8B: return "2.8B";
+        case MODEL_3B: return "3B";
+        case MODEL_4B: return "4B";
+        case MODEL_6_9B: return "6.9B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_12B: return "12B";
+        case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
+        case MODEL_15B: return "15B";
+        case MODEL_20B: return "20B";
+        case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_314B: return "314B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        case MODEL_A2_7B: return "A2.7B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default: return "?B";
     }
 }
 
@@ -3873,6 +3942,8 @@ static void llm_load_hparams(
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3972,14 +4043,6 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_REFACT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -4121,6 +4184,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4261,6 +4325,65 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4461,12 +4584,18 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "qwen2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+            tokenizer_pre == "stablelm2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
         } else if (
             tokenizer_pre == "olmo") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
         } else if (
             tokenizer_pre == "dbrx") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+        } else if (
+            tokenizer_pre == "smaug-bpe") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -4582,7 +4711,8 @@ static void llm_load_vocab(
                 (t.first == "<|eot_id|>" ||
                  t.first == "<|im_end|>" ||
                  t.first == "<|end|>" ||
-                 t.first == "<end_of_turn>"
+                 t.first == "<end_of_turn>" ||
+                 t.first == "<|endoftext|>"
                 )
             ) {
                 vocab.special_eot_id = t.second;
@@ -4908,6 +5038,7 @@ static bool llm_load_tensors(
|
|
|
4908
5038
|
// create tensors for the weights
|
|
4909
5039
|
{
|
|
4910
5040
|
const int64_t n_embd = hparams.n_embd;
|
|
5041
|
+
const int64_t n_embd_head = n_embd / hparams.n_head;
|
|
4911
5042
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
4912
5043
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
4913
5044
|
const int64_t n_embd_gqa = n_embd_v_gqa;
|
|
@@ -4942,12 +5073,10 @@ static bool llm_load_tensors(
|
|
|
4942
5073
|
{
|
|
4943
5074
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
4944
5075
|
if (model.arch != LLM_ARCH_MINICPM){
|
|
4945
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5076
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
4946
5077
|
// if output is NULL, init from the input tok embed
|
|
4947
5078
|
if (model.output == NULL) {
|
|
4948
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
4949
|
-
ml.n_created--; // artificial tensor
|
|
4950
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5079
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
4951
5080
|
}
|
|
4952
5081
|
}
|
|
4953
5082
|
}
|
|
@@ -4966,10 +5095,10 @@ static bool llm_load_tensors(
|
|
|
4966
5095
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
4967
5096
|
|
|
4968
5097
|
// optional bias tensors
|
|
4969
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
|
4970
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
|
4971
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
|
4972
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
|
5098
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5099
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5100
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5101
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
4973
5102
|
|
|
4974
5103
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
4975
5104
|
|
|
@@ -4980,7 +5109,7 @@ static bool llm_load_tensors(
|
|
|
4980
5109
|
} else {
|
|
4981
5110
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
|
4982
5111
|
|
|
4983
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
|
5112
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
4984
5113
|
if (layer.ffn_gate_exps) {
|
|
4985
5114
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
|
4986
5115
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
|
@@ -5022,12 +5151,10 @@ static bool llm_load_tensors(
|
|
|
5022
5151
|
// output
|
|
5023
5152
|
{
|
|
5024
5153
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5025
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5154
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5026
5155
|
// if output is NULL, init from the input tok embed
|
|
5027
5156
|
if (model.output == NULL) {
|
|
5028
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
5029
|
-
ml.n_created--; // artificial tensor
|
|
5030
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5157
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5031
5158
|
}
|
|
5032
5159
|
}
|
|
5033
5160
|
|
|
@@ -5050,7 +5177,7 @@ static bool llm_load_tensors(
|
|
|
5050
5177
|
|
|
5051
5178
|
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
|
|
5052
5179
|
|
|
5053
|
-
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert},
|
|
5180
|
+
layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5054
5181
|
if (layer.ffn_gate_exps) {
|
|
5055
5182
|
layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
|
|
5056
5183
|
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
|
|
@@ -5152,11 +5279,9 @@ static bool llm_load_tensors(
|
|
|
5152
5279
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5153
5280
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
5154
5281
|
|
|
5155
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5282
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5156
5283
|
if (!model.output) {
|
|
5157
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
|
5158
|
-
ml.n_created--; // artificial tensor
|
|
5159
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5284
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
|
5160
5285
|
}
|
|
5161
5286
|
}
|
|
5162
5287
|
|
|
@@ -5169,8 +5294,8 @@ static bool llm_load_tensors(
|
|
|
5169
5294
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
5170
5295
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
5171
5296
|
|
|
5172
|
-
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd},
|
|
5173
|
-
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd},
|
|
5297
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5298
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5174
5299
|
|
|
5175
5300
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
5176
5301
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
@@ -5188,7 +5313,12 @@ static bool llm_load_tensors(
|
|
|
5188
5313
|
{
|
|
5189
5314
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5190
5315
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
5191
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
5316
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5317
|
+
if (!model.output) {
|
|
5318
|
+
// needs to be on GPU
|
|
5319
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5320
|
+
}
|
|
5321
|
+
|
|
5192
5322
|
}
|
|
5193
5323
|
|
|
5194
5324
|
for (int i = 0; i < n_layer; ++i) {
|
|
@@ -5216,47 +5346,6 @@ static bool llm_load_tensors(
|
|
|
5216
5346
|
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
5217
5347
|
}
|
|
5218
5348
|
} break;
|
|
5219
|
-
case LLM_ARCH_PERSIMMON:
|
|
5220
|
-
{
|
|
5221
|
-
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
5222
|
-
|
|
5223
|
-
{
|
|
5224
|
-
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5225
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
5226
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
5227
|
-
}
|
|
5228
|
-
|
|
5229
|
-
for (int i = 0; i < n_layer; ++i) {
|
|
5230
|
-
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
5231
|
-
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
5232
|
-
|
|
5233
|
-
auto & layer = model.layers[i];
|
|
5234
|
-
|
|
5235
|
-
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
5236
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
5237
|
-
|
|
5238
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
5239
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
|
5240
|
-
|
|
5241
|
-
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
5242
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
5243
|
-
|
|
5244
|
-
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
5245
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
5246
|
-
|
|
5247
|
-
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
5248
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
5249
|
-
|
|
5250
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
5251
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
5252
|
-
|
|
5253
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
|
|
5254
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
|
|
5255
|
-
|
|
5256
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
|
|
5257
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
|
5258
|
-
}
|
|
5259
|
-
} break;
|
|
5260
5349
|
case LLM_ARCH_BERT:
|
|
5261
5350
|
case LLM_ARCH_NOMIC_BERT:
|
|
5262
5351
|
{
|
|
@@ -5325,14 +5414,14 @@ static bool llm_load_tensors(
|
|
|
5325
5414
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
5326
5415
|
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
|
5327
5416
|
|
|
5328
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
|
5329
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
|
5417
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5418
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5330
5419
|
|
|
5331
5420
|
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
5332
5421
|
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
5333
5422
|
|
|
5334
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
|
5335
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
|
5423
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5424
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5336
5425
|
|
|
5337
5426
|
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
5338
5427
|
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
|
@@ -5394,18 +5483,16 @@ static bool llm_load_tensors(
|
|
|
5394
5483
|
case LLM_ARCH_MPT:
|
|
5395
5484
|
{
|
|
5396
5485
|
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
5397
|
-
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train},
|
|
5486
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5398
5487
|
|
|
5399
5488
|
// output
|
|
5400
5489
|
{
|
|
5401
5490
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5402
|
-
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
|
|
5491
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5403
5492
|
|
|
5404
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5493
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5405
5494
|
if (!model.output) {
|
|
5406
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
|
|
5407
|
-
ml.n_created--; // artificial tensor
|
|
5408
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5495
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
|
|
5409
5496
|
}
|
|
5410
5497
|
}
|
|
5411
5498
|
|
|
@@ -5416,31 +5503,31 @@ static bool llm_load_tensors(
|
|
|
5416
5503
|
auto & layer = model.layers[i];
|
|
5417
5504
|
|
|
5418
5505
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
5419
|
-
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd},
|
|
5506
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5420
5507
|
|
|
5421
5508
|
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
5422
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
|
5509
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5423
5510
|
|
|
5424
5511
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
5425
|
-
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
|
|
5512
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5426
5513
|
|
|
5427
5514
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
5428
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
|
5515
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5429
5516
|
|
|
5430
5517
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
5431
|
-
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd},
|
|
5518
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5432
5519
|
|
|
5433
5520
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
5434
|
-
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff},
|
|
5521
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5435
5522
|
|
|
5436
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
|
|
5437
|
-
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
|
|
5523
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5524
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5438
5525
|
|
|
5439
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
|
|
5440
|
-
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
|
|
5526
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5527
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5441
5528
|
|
|
5442
5529
|
// AWQ ScaleActivation layer
|
|
5443
|
-
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff},
|
|
5530
|
+
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5444
5531
|
}
|
|
5445
5532
|
} break;
|
|
5446
5533
|
case LLM_ARCH_STABLELM:
|
|
@@ -5469,17 +5556,17 @@ static bool llm_load_tensors(
|
|
|
5469
5556
|
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
5470
5557
|
|
|
5471
5558
|
// optional bias tensors, present in Stable LM 2 1.6B
|
|
5472
|
-
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
|
5473
|
-
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
|
5474
|
-
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
|
5559
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5560
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5561
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5475
5562
|
|
|
5476
5563
|
// optional q and k layernorms, present in StableLM 2 12B
|
|
5477
|
-
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
|
|
5478
|
-
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
|
|
5564
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5565
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5479
5566
|
|
|
5480
5567
|
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
|
5481
|
-
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
|
|
5482
|
-
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
|
|
5568
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5569
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5483
5570
|
|
|
5484
5571
|
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
5485
5572
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
@@ -5522,12 +5609,10 @@ static bool llm_load_tensors(
|
|
|
5522
5609
|
// output
|
|
5523
5610
|
{
|
|
5524
5611
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5525
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5612
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5526
5613
|
// if output is NULL, init from the input tok embed
|
|
5527
5614
|
if (model.output == NULL) {
|
|
5528
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
5529
|
-
ml.n_created--; // artificial tensor
|
|
5530
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5615
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5531
5616
|
}
|
|
5532
5617
|
}
|
|
5533
5618
|
|
|
@@ -5625,8 +5710,8 @@ static bool llm_load_tensors(
|
|
|
5625
5710
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
5626
5711
|
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
5627
5712
|
|
|
5628
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
|
|
5629
|
-
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
|
5713
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5714
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5630
5715
|
|
|
5631
5716
|
if (layer.wqkv == nullptr) {
|
|
5632
5717
|
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
@@ -5663,17 +5748,20 @@ static bool llm_load_tensors(
|
|
|
5663
5748
|
ggml_context* ctx_layer = ctx_for_layer(i);
|
|
5664
5749
|
ggml_context* ctx_split = ctx_for_layer_split(i);
|
|
5665
5750
|
|
|
5666
|
-
auto& layer = model.layers[i];
|
|
5751
|
+
auto & layer = model.layers[i];
|
|
5667
5752
|
|
|
5668
5753
|
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });
|
|
5669
5754
|
|
|
5670
|
-
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
|
|
5671
|
-
layer.wo
|
|
5755
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5756
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });
|
|
5672
5757
|
|
|
5673
5758
|
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });
|
|
5674
5759
|
|
|
5675
5760
|
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
|
|
5676
5761
|
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
|
|
5762
|
+
|
|
5763
|
+
layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
|
5764
|
+
layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
|
|
5677
5765
|
}
|
|
5678
5766
|
} break;
|
|
5679
5767
|
case LLM_ARCH_PLAMO:
|
|
@@ -5842,9 +5930,7 @@ static bool llm_load_tensors(
|
|
|
5842
5930
|
|
|
5843
5931
|
// output
|
|
5844
5932
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5845
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
|
|
5846
|
-
ml.n_created--; // artificial tensor
|
|
5847
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5933
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading
|
|
5848
5934
|
|
|
5849
5935
|
const int64_t n_ff = hparams.n_ff;
|
|
5850
5936
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
@@ -5879,12 +5965,10 @@ static bool llm_load_tensors(
|
|
|
5879
5965
|
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
5880
5966
|
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
5881
5967
|
|
|
5882
|
-
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
|
|
5968
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
|
|
5883
5969
|
// if output is NULL, init from the input tok embed
|
|
5884
5970
|
if (model.output == NULL) {
|
|
5885
|
-
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
5886
|
-
ml.n_created--; // artificial tensor
|
|
5887
|
-
ml.size_data += ggml_nbytes(model.output);
|
|
5971
|
+
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
|
|
5888
5972
|
}
|
|
5889
5973
|
|
|
5890
5974
|
}
|
|
@@ -5935,12 +6019,10 @@ static bool llm_load_tensors(
         {
             model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
             // if output is NULL, init from the input tok embed, duplicated to allow offloading
             if (model.output == NULL) {
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
             }
         }
 
@@ -6001,9 +6083,7 @@ static bool llm_load_tensors(
         {
            model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
            // init output from the input tok embed
-           model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-           ml.n_created--; // artificial tensor
-           ml.size_data += ggml_nbytes(model.output);
+           model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
         }
 
         for (int i = 0; i < n_layer; ++i) {
@@ -6035,12 +6115,10 @@ static bool llm_load_tensors(
 
         // output
         {
-            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+            model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
             // if output is NULL, init from the input tok embed
             if (model.output == NULL) {
-                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                ml.n_created--; // artificial tensor
-                ml.size_data += ggml_nbytes(model.output);
+                model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
             }
         }
 
@@ -6060,6 +6138,81 @@ static bool llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+                    layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+                    layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+                    layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+                    layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+                    layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                    layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    // if output is NULL, init from the input tok embed
+                    if (model.output == NULL) {
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                    }
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                    layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                    layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -6324,10 +6477,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -6622,6 +6772,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6644,23 +6795,23 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);
 
         cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
         }
 
-        cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
 
-        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+        if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
             ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
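The two corrected `ggml_row_size(...)` arguments above are the byte strides of the V-cache view: consecutive KV positions are `n_embd_v_gqa` elements apart, and heads within one position are `n_embd_head_v` elements apart. A standalone model of that index arithmetic (plain integers, not ggml; the dimension names are borrowed from the hunk):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
    const size_t n_embd_head_v = 128, n_head_kv = 8;
    const size_t n_embd_v_gqa  = n_embd_head_v * n_head_kv; // row size per KV position
    // element (d, t, h) of a non-transposed V cache laid out [n_embd_v_gqa x n_kv]:
    const size_t d = 3, t = 5, h = 2;                       // dim, token, head
    const size_t offset = t * n_embd_v_gqa + h * n_embd_head_v + d; // in elements
    std::printf("flat element offset = %zu (bytes = offset * element size)\n", offset);
}
```

The previous code used the K-side sizes here, which only happened to work for models where the K and V head dimensions coincide.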
@@ -6700,7 +6851,7 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
         cb(kqv_merged, "kqv_merged", il);
 
-        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens);
+        cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
         cb(cur, "kqv_merged_cont", il);
     }
 
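GPT-NeoX is added above to the architectures that force F32 precision for the KQ product; the referenced PR discussion attributes the NaNs to fp16 range limits. A toy standalone illustration (an assumption-level simplification, not the actual kernel) of how a large attention logit can overflow fp16 and poison the softmax:

```cpp
#include <cstdio>
#include <limits>

int main() {
    float acc = 0.0f;
    for (int i = 0; i < 4096; ++i) acc += 32.0f;   // a KQ dot product: 131072
    const float F16_MAX = 65504.0f;                // largest finite fp16 value
    // emulate fp16 accumulation: anything past F16_MAX becomes infinity
    float acc_f16 = acc > F16_MAX ? std::numeric_limits<float>::infinity() : acc;
    // inf - inf = NaN, which is what the max-subtraction step of softmax does
    std::printf("f32=%.0f f16-style=%f inf-inf=%f\n", acc, acc_f16, acc_f16 - acc_f16);
}
```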
@@ -6885,17 +7036,20 @@ struct llm_build_context {
         cb(lctx.inp_K_shift, "K_shift", -1);
         ggml_set_input(lctx.inp_K_shift);
 
+
         for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
             struct ggml_tensor * tmp =
                 // we rotate only the first n_rot dimensions
-                ggml_rope_custom_inplace(ctx0,
+                ggml_rope_ext_inplace(ctx0,
                         ggml_view_3d(ctx0, kv_self.k_l[il],
                             n_embd_head_k, n_head_kv, n_ctx,
                             ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
                             ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
                             0),
-                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                        lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow);
+
             cb(tmp, "K_shifted", il);
             ggml_build_forward_expand(gf, tmp);
         }
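The K-shift pass above now threads `rope_factors` through the in-place rope. The reason this matters: shifting the KV cache re-rotates stored keys by a position delta, and since 2-D rotations compose additively, the re-rotation must use the same per-dimension frequency scaling as the original rotation or cached and freshly computed keys would disagree. A standalone numeric check of that composition (toy model, not ggml):

```cpp
#include <cmath>
#include <cstdio>

// rotate a 2-D pair by `angle`, the basic RoPE building block
void rotate(float v[2], float angle) {
    const float c = std::cos(angle), s = std::sin(angle);
    const float x = v[0]*c - v[1]*s, y = v[0]*s + v[1]*c;
    v[0] = x; v[1] = y;
}

int main() {
    const float factor = 4.0f;           // hypothetical long-context freq factor
    const float theta  = 1.0f / factor;  // scaled base frequency
    float k[2] = {1.0f, 0.0f};
    rotate(k,  5 * theta);               // key cached at position 5
    rotate(k, -3 * theta);               // K-shift by -3 -> as if at position 2
    float k2[2] = {1.0f, 0.0f};
    rotate(k2, 2 * theta);               // key rotated directly at position 2
    std::printf("shifted=(%.4f,%.4f) direct=(%.4f,%.4f)\n", k[0], k[1], k2[0], k2[1]);
}
```

Both prints agree only because the shift uses the same scaled `theta`; mixing scaled and unscaled rotations here would corrupt the cache.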
@@ -6998,6 +7152,17 @@ struct llm_build_context {
         return lctx.inp_pos;
     }
 
+    struct ggml_tensor * build_rope_factors(int il) {
+        // choose long/short freq factors based on the context size
+        const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+        if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+            return model.layers[il].rope_long;
+        }
+
+        return model.layers[il].rope_short;
+    }
+
     struct ggml_tensor * build_inp_out_ids() {
         lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
         cb(lctx.inp_out_ids, "inp_out_ids", -1);
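`build_rope_factors` picks the long or short factor tensor by comparing the per-sequence context slice with the model's original training context. A standalone sketch of the same selection (a hypothetical free function mirroring the logic; the real code returns a ggml tensor from `model.layers[il]`):

```cpp
#include <cstdint>
#include <cstdio>

const float * choose_rope_factors(uint32_t n_ctx, uint32_t n_seq_max,
                                  uint32_t n_yarn_orig_ctx,
                                  const float * rope_long, const float * rope_short) {
    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max; // per-sequence slice of the KV cache
    return n_ctx_per_seq > n_yarn_orig_ctx ? rope_long : rope_short;
}

int main() {
    float f_long[1] = {1.5f}, f_short[1] = {1.0f};
    // a 131072-token context against a 4096-token original context -> long factors
    const float * f = choose_rope_factors(131072, 1, 4096, f_long, f_short);
    std::printf("factor[0] = %.2f\n", f[0]);
}
```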
@@ -7105,15 +7270,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
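This hunk shows the mechanical change repeated across every builder below: the rope call takes a frequency-factors tensor right after the positions, and architectures without per-layer factors pass `nullptr` (Phi-3 passes `build_rope_factors(il)` instead). A numeric sketch of what a non-null factor tensor plausibly does to the rotary angles — an assumption inferred from the long/short factor usage above, not the actual ggml kernel:

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const int   n_rot      = 8;
    const float freq_base  = 10000.0f;
    const float factors[4] = {1.0f, 1.0f, 4.0f, 4.0f}; // hypothetical long-context factors
    const int   pos = 42;
    for (int i = 0; i < n_rot/2; ++i) {
        float theta = pos * std::pow(freq_base, -2.0f*i/n_rot);
        theta /= factors[i]; // a nullptr factor tensor behaves like all-ones (plain RoPE)
        std::printf("dim pair %d: angle %.4f\n", i, theta);
    }
}
```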
@@ -7235,13 +7400,13 @@ struct llm_build_context {
 
             switch (model.type) {
                 case MODEL_7B:
-                    Qcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    Qcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                         n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                     );
-                    Kcur = ggml_rope_custom(
-                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    Kcur = ggml_rope_ext(
+                        ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                         n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                     );
@@ -7347,15 +7512,15 @@ struct llm_build_context {
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -7468,14 +7633,14 @@ struct llm_build_context {
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
             // using mode = 2 for neox mode
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7591,15 +7756,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -7743,15 +7908,15 @@ struct llm_build_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -7920,256 +8085,49 @@ struct llm_build_context {
         return gf;
     }
 
-    struct ggml_cgraph * build_persimmon() {
+    struct ggml_cgraph * build_refact() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
 
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
 
         for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * residual = inpL;
+            struct ggml_tensor * inpSA = inpL;
 
             cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm,
-                    model.layers[il].attn_norm_b,
-                    LLM_NORM, cb, il);
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            // self attention
+            // self-attention
             {
-                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
-                cb(cur, "wqkv", il);
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
 
-                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                cb(cur, "bqkv", il);
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
 
-                // split qkv
-                GGML_ASSERT(n_head_kv == n_head);
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
 
-                struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
-                cb(tmpqkv, "tmpqkv", il);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
-                cb(tmpqkv_perm, "tmpqkv", il);
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * tmpq = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        0
-                    );
-                cb(tmpq, "tmpq", il);
-
-                struct ggml_tensor * tmpk = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-                    );
-                cb(tmpk, "tmpk", il);
-
-                // Q/K Layernorm
-                tmpq = llm_build_norm(ctx0, tmpq, hparams,
-                        model.layers[il].attn_q_norm,
-                        model.layers[il].attn_q_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpq, "tmpq", il);
-
-                tmpk = llm_build_norm(ctx0, tmpk, hparams,
-                        model.layers[il].attn_k_norm,
-                        model.layers[il].attn_k_norm_b,
-                        LLM_NORM, cb, il);
-                cb(tmpk, "tmpk", il);
-
-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        0
-                    );
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        0
-                    );
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                        ctx0, tmpq, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpq) * n_embd_head,
-                        ggml_element_size(tmpq) * n_embd_head * n_head,
-                        ggml_element_size(tmpq) * n_rot
-                    );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                        ctx0, tmpk, n_rot, n_head, n_tokens,
-                        ggml_element_size(tmpk) * n_embd_head,
-                        ggml_element_size(tmpk) * n_embd_head * n_head,
-                        ggml_element_size(tmpk) * n_rot
-                    );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                        ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                        ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_view_3d(
-                        ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-                        ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-                    );
-                cb(Vcur, "Vcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
-
-            if (il == n_layer - 1) {
-                // skip computing output for unused tokens
-                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-            }
-
-            struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-            cb(ffn_inp, "ffn_inp", il);
-
-            // feed-forward network
-            {
-                cur = llm_build_norm(ctx0, ffn_inp, hparams,
-                        model.layers[il].ffn_norm,
-                        model.layers[il].ffn_norm_b,
-                        LLM_NORM, cb, il);
-                cb(cur, "ffn_norm", il);
-
-                cur = llm_build_ffn(ctx0, cur,
-                        model.layers[il].ffn_up, model.layers[il].ffn_up_b,
-                        NULL, NULL,
-                        model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-                        NULL,
-                        LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cb(cur, "l_out", il);
-
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm,
-                model.output_norm_b,
-                LLM_NORM, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_refact() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self-attention
-            {
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
-
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
-
-                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-                cb(Kcur, "Kcur", il);
-
-                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-                cb(Qcur, "Qcur", il);
-
-                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-                        model.layers[il].wo, NULL,
-                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-            }
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
 
             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
@@ -8303,15 +8261,15 @@ struct llm_build_context {
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -8743,15 +8701,15 @@ struct llm_build_context {
             }
 
 
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -8863,14 +8821,14 @@ struct llm_build_context {
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
             // using mode = 2 for neox mode
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -8974,15 +8932,15 @@ struct llm_build_context {
             Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
            );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -9088,15 +9046,15 @@ struct llm_build_context {
             Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -9240,8 +9198,8 @@ struct llm_build_context {
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-            Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
@@ -9251,8 +9209,8 @@ struct llm_build_context {
             Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+            Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -9328,6 +9286,9 @@ struct llm_build_context {
 
             // self-attention
             {
+                // rope freq factors for 128k context
+                struct ggml_tensor * rope_factors = build_rope_factors(il);
+
                 struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm,
                     NULL,
@@ -9359,8 +9320,8 @@ struct llm_build_context {
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
 
-                Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -9368,8 +9329,8 @@ struct llm_build_context {
                 Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
                 cb(Qcur, "Qcur", il);
 
-                Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -9475,14 +9436,14 @@ struct llm_build_context {
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
                 n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);
@@ -9683,15 +9644,15 @@ struct llm_build_context {
             cb(tmpk, "tmpk", il);
             cb(Vcur, "Vcur", il);
 
-            struct ggml_tensor * Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
+            struct ggml_tensor * Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            struct ggml_tensor * Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            struct ggml_tensor * Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -9799,15 +9760,15 @@ struct llm_build_context {
             //     cb(Vcur, "Vcur", il);
             // }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -9916,15 +9877,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -10046,15 +10007,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -10166,8 +10127,8 @@ struct llm_build_context {
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
                 n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);
@@ -10175,8 +10136,8 @@ struct llm_build_context {
             Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
             cb(Qcur, "Qcur_scaled", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);
@@ -10286,15 +10247,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -10576,15 +10537,15 @@ struct llm_build_context {
                 cb(Kcur, "Kcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -10707,15 +10668,15 @@ struct llm_build_context {
                 cb(Vcur, "Vcur", il);
             }
 
-            Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+            Qcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);
 
-            Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+            Kcur = ggml_rope_ext(
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
                 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
@@ -10779,6 +10740,274 @@ struct llm_build_context {
|
|
|
10779
10740
|
|
|
10780
10741
|
return gf;
|
|
10781
10742
|
}
|
|
10743
|
+
|
|
10744
|
+
struct ggml_cgraph * build_gptneox() {
|
|
10745
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
10746
|
+
|
|
10747
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10748
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
10749
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10750
|
+
|
|
10751
|
+
struct ggml_tensor * cur;
|
|
10752
|
+
struct ggml_tensor * inpL;
|
|
10753
|
+
|
|
10754
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
10755
|
+
|
|
10756
|
+
// inp_pos - contains the positions
|
|
10757
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
10758
|
+
|
|
10759
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
10760
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
10761
|
+
|
|
10762
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10763
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10764
|
+
model.layers[il].attn_norm,
|
|
10765
|
+
model.layers[il].attn_norm_b,
|
|
10766
|
+
LLM_NORM, cb, il);
|
|
10767
|
+
cb(cur, "attn_norm", il);
|
|
10768
|
+
|
|
10769
|
+
// self-attention
|
|
10770
|
+
{
|
|
10771
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
10772
|
+
cb(cur, "wqkv", il);
|
|
10773
|
+
|
|
10774
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
10775
|
+
cb(cur, "bqkv", il);
|
|
10776
|
+
|
|
10777
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
10778
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
10779
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
10780
|
+
|
|
10781
|
+
cb(Qcur, "Qcur", il);
|
|
10782
|
+
cb(Kcur, "Kcur", il);
|
|
10783
|
+
cb(Vcur, "Vcur", il);
|
|
10784
|
+
|
|
10785
|
+
Qcur = ggml_rope_ext(
|
|
10786
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10787
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10788
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10789
|
+
);
|
|
10790
|
+
cb(Qcur, "Qcur", il);
|
|
10791
|
+
|
|
10792
|
+
Kcur = ggml_rope_ext(
|
|
10793
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10794
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10795
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10796
|
+
);
|
|
10797
|
+
cb(Kcur, "Kcur", il);
|
|
10798
|
+
|
|
10799
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10800
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
10801
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10802
|
+
}
|
|
10803
|
+
|
|
10804
|
+
if (il == n_layer - 1) {
|
|
10805
|
+
// skip computing output for unused tokens
|
|
10806
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10807
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10808
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10809
|
+
}
|
|
10810
|
+
|
|
10811
|
+
// ffn
|
|
10812
|
+
if (hparams.use_par_res) {
|
|
10813
|
+
// attention and ffn are computed in parallel
|
|
10814
|
+
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
|
10815
|
+
|
|
10816
|
+
struct ggml_tensor * attn_out = cur;
|
|
10817
|
+
|
|
10818
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10819
|
+
model.layers[il].ffn_norm,
|
|
10820
|
+
model.layers[il].ffn_norm_b,
|
|
10821
|
+
LLM_NORM, cb, il);
|
|
10822
|
+
cb(cur, "ffn_norm", il);
|
|
10823
|
+
|
|
10824
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
10825
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
10826
|
+
NULL, NULL,
|
|
10827
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
10828
|
+
NULL,
|
|
10829
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
10830
|
+
cb(cur, "ffn_out", il);
|
|
10831
|
+
|
|
10832
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
10833
|
+
cb(cur, "ffn_out", il);
|
|
10834
|
+
|
|
10835
|
+
inpL = ggml_add(ctx0, cur, attn_out);
|
|
10836
|
+
cb(inpL, "l_out", il);
|
|
10837
|
+
} else {
|
|
10838
|
+
// attention and ffn are computed sequentially
|
|
10839
|
+
// x = x + attn(ln1(x))
|
|
10840
|
+
// x = x + ffn(ln2(x))
|
|
10841
|
+
|
|
10842
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
10843
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
10844
|
+
|
|
10845
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
10846
|
+
model.layers[il].ffn_norm,
|
|
10847
|
+
model.layers[il].ffn_norm_b,
|
|
10848
|
+
LLM_NORM, cb, il);
|
|
10849
|
+
cb(cur, "ffn_norm", il);
|
|
10850
|
+
|
|
10851
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
10852
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
10853
|
+
NULL, NULL,
|
|
10854
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
10855
|
+
NULL,
|
|
10856
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
10857
|
+
cb(cur, "ffn_out", il);
|
|
10858
|
+
|
|
10859
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
|
10860
|
+
cb(inpL, "l_out", il);
|
|
10861
|
+
}
|
|
10862
|
+
}
|
|
10863
|
+
|
|
10864
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10865
|
+
model.output_norm,
|
|
10866
|
+
model.output_norm_b,
|
|
10867
|
+
LLM_NORM, cb, -1);
|
|
10868
|
+
cb(cur, "result_norm", -1);
|
|
10869
|
+
|
|
10870
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10871
|
+
cb(cur, "result_output", -1);
|
|
10872
|
+
|
|
10873
|
+
ggml_build_forward_expand(gf, cur);
|
|
10874
|
+
|
|
10875
|
+
return gf;
|
|
10876
|
+
}
|
|
10877
|
+
|
|
10878
|
+
struct ggml_cgraph * build_arctic() {
|
|
10879
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
10880
|
+
|
|
10881
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
10882
|
+
int32_t n_tokens = this->n_tokens;
|
|
10883
|
+
|
|
10884
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10885
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10886
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
10887
|
+
|
|
10888
|
+
struct ggml_tensor * cur;
|
|
10889
|
+
struct ggml_tensor * inpL;
|
|
10890
|
+
|
|
10891
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
10892
|
+
|
|
10893
|
+
// inp_pos - contains the positions
|
|
10894
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
10895
|
+
|
|
10896
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
10897
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
10898
|
+
|
|
10899
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10900
|
+
struct ggml_tensor * inpSA = inpL;
|
|
10901
|
+
|
|
10902
|
+
// norm
|
|
10903
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10904
|
+
model.layers[il].attn_norm, NULL,
|
|
10905
|
+
LLM_NORM_RMS, cb, il);
|
|
10906
|
+
cb(cur, "attn_norm", il);
|
|
10907
|
+
|
|
10908
|
+
// self-attention
|
|
10909
|
+
{
|
|
10910
|
+
// compute Q and K and RoPE them
|
|
10911
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
10912
|
+
cb(Qcur, "Qcur", il);
|
|
10913
|
+
|
|
10914
|
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
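The hunk above is the tail of the new `build_arctic()` graph builder (Snowflake Arctic). Its layer structure is unusual enough to be worth summarizing: a dense SiLU-gated FFN runs on the attention residual, while a routed mixture-of-experts FFN runs in parallel on the layer input, and the two results are summed. A dataflow sketch of one layer, paraphrasing the tensors the hunk builds (illustrative pseudocode, not the diff's code):

    // ffn_inp = attn_out + inpSA                  residual after attention
    // ffn_out = ffn_inp + FFN(rms_norm(ffn_inp))  dense SiLU-gated FFN
    // moe_out = MoE(rms_norm(inpSA))              n_expert_used of n_expert experts
    // l_out   = moe_out + ffn_out                 input for the next layer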
@@ -10895,10 +11124,6 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_starcoder();
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                result = llm.build_persimmon();
-            } break;
         case LLM_ARCH_REFACT:
             {
                 result = llm.build_refact();
@@ -10993,6 +11218,14 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                result = llm.build_gptneox();
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -11338,11 +11571,6 @@ static void llama_graph_compute(
         llama_context & lctx,
           ggml_cgraph * gf,
                   int   n_threads) {
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = lctx.model.hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -11357,10 +11585,6 @@ static void llama_graph_compute(
     ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -11398,12 +11622,6 @@ static int llama_decode_internal(
     }
     lctx.n_queued_tokens += n_tokens_all;
 
-#ifdef GGML_USE_MPI
-    // TODO: needs fix after #3228
-    GGML_ASSERT(false && "not implemented");
-    //ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
-#endif
-
     auto & kv_self = lctx.kv_self;
 
     const int64_t n_embd = hparams.n_embd;
@@ -12297,6 +12515,7 @@ struct llm_tokenizer_bpe {
                     });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                 word_collection = unicode_regex_split(text, {
                     // same as llama3
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12353,6 +12572,7 @@ struct llm_tokenizer_bpe {
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                 });
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
             case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                 word_collection = unicode_regex_split(text, {
                     // original regex from tokenizer.json
@@ -12575,16 +12795,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-
-            if (
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (
+            if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
@@ -12787,9 +13007,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True) returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []
 
+                static const bool rtrim = true; //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                 }
 
                 for (const auto & fragment : fragment_buffer) {
@@ -12801,9 +13026,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         // and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-
-
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) { // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }
 
@@ -12815,9 +13052,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
+                        is_prev_special = true;
+                        // phi-3 special tokens without rtrim, works fine for llama-spm too
+                        special_token_rtrim = rtrim
+                            && fragment.token != vocab.special_bos_id
+                            && fragment.token != vocab.special_unk_id
+                            && fragment.token != vocab.special_eos_id;
                     }
                 }
 
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
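The `special_token_rtrim` machinery added above appears to mirror the Phi-3 tokenizer's behavior: after a parsed special token other than BOS/UNK/EOS, leading whitespace of the following text fragment is stripped, and a fragment that is nothing but whitespace is dropped entirely. An illustrative consequence (not from the diff):

    // with rtrim active after the special token:
    //   "<|user|> Hello"  now tokenizes the same as  "<|user|>Hello"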
@@ -12844,6 +13094,13 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }
 
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_add_eos != -1);
                     output.push_back(vocab.special_eos_id);
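Both vocab branches (SPM above, WPM here) gain the same guard: when `add_special` prepends a BOS token and the prompt text already begins with one, the result starts with two BOS tokens and the new warning fires. A minimal sketch of a call that would trigger it, assuming a model whose vocab requests BOS insertion; the token ids are illustrative:

    std::vector<llama_token> toks(32);
    // prompt text already carries an explicit BOS marker
    const int n = llama_tokenize(model, "<s> Hello", 9, toks.data(), (int) toks.size(),
                                 /*add_special=*/true, /*parse_special=*/true);
    // toks -> [1, 1, 15043, ...]  (output[1] == BOS)  => warning is logged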
@@ -13904,9 +14161,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -13920,9 +14175,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;
 
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }
 
@@ -14507,8 +14760,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15522,10 +15773,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }
 
 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15535,9 +15782,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }
 
 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }
 
@@ -15680,6 +15924,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
     cparams.causal_attn = hparams.causal_attn;
 
     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
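The single added line folds a per-model attention scale into the YaRN attention factor. Presumably `hparams.rope_attn_factor` is read from the model's GGUF metadata and defaults to 1.0f, so existing models are unchanged; a model shipping, say, 1.2 would make the effective `attn_factor` passed into the RoPE ops 1.2 times the configured `yarn_attn_factor`.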
@@ -15938,20 +16183,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }
 
-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }
 
@@ -15988,7 +16219,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
@@ -16008,13 +16238,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
-        case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
@@ -16025,6 +16255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
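These three hunks reclassify GPT-NeoX: it previously sat in the "no RoPE" bucket, but the new `build_gptneox()` graph applies NEOX-style rotary embeddings. For reference, the two layouts differ only in how head components are paired before rotation (illustrative indexing, not the diff's code):

    // LLAMA_ROPE_TYPE_NORM: rotate adjacent pairs     (x[2*i], x[2*i+1])
    // LLAMA_ROPE_TYPE_NEOX: rotate across the halves  (x[i],   x[i + n_rot/2])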
@@ -16184,6 +16415,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16192,6 +16424,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }
 
     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
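These two control-vector hunks are an allocation cleanup rather than a behavior change: reserving `n_layer` slots for `cvec.tensors`, and one slot per buffer type for `cvec.ctxs` and `cvec.bufs`, avoids repeated vector reallocations inside the loops that follow.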
@@ -17015,13 +17249,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }
 
     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17400,6 +17634,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }
 
+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
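The two new getters simply read back what `llama_set_n_threads` stores in `cparams`. A minimal usage sketch, assuming a valid `ctx` created elsewhere:

    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);
    const uint32_t n_gen   = llama_n_threads(ctx);        // 8, single-token decode
    const uint32_t n_batch = llama_n_threads_batch(ctx);  // 16, prompt/batch processing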
@@ -17623,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }
 
+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
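`llama_token_is_control` exposes the vocab's control-token attribute through the public API. A sketch of the filtering this enables when printing generated text (assumes `model` and a sampled token `id`; the `llama_token_to_piece` call follows the llama.h shipped with this release):

    if (!llama_token_is_control(model, id)) {
        char buf[128];
        const int n = llama_token_to_piece(model, id, buf, sizeof(buf), /*special=*/false);
        if (n > 0) {
            fwrite(buf, 1, n, stdout);  // render only non-control tokens
        }
    }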
@@ -17834,6 +18080,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
@@ -17966,15 +18221,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
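Taken together with the earlier hunk (+18080), this is a move plus a tweak: the Phi 3 branch now runs before the zephyr check, whose `<|user|>` probe would otherwise match Phi 3 templates first, and it no longer calls `trim()` on the message content. For a single user message "Hello" with `add_ass` set, the branch renders:

    <|user|>
    Hello<|end|>
    <|assistant|>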
@@ -18096,8 +18342,10 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -18156,6 +18404,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
 
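With the new `#elif`, a callback installed via `llama_log_set` is now forwarded to the CUDA backend as well as Metal. A minimal sketch of installing one (the callback signature is ggml's `ggml_log_callback`):

    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level; (void) user_data;
        fputs(text, stderr);  // route llama.cpp and backend logs to stderr
    }

    // during startup:
    llama_log_set(my_log, /*user_data=*/NULL);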