@fugood/llama.node 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +9 -0
- package/README.md +1 -1
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +1 -1
- package/package.json +2 -1
- package/patches/llama.patch +22 -0
- package/src/LlamaContext.cpp +2 -2
- package/src/TokenizeWorker.cpp +1 -1
- package/src/llama.cpp/CMakeLists.txt +82 -54
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +16 -0
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +6 -0
- package/src/llama.cpp/common/common.cpp +748 -754
- package/src/llama.cpp/common/common.h +49 -41
- package/src/llama.cpp/common/grammar-parser.cpp +10 -1
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +6 -6
- package/src/llama.cpp/common/log.h +5 -5
- package/src/llama.cpp/common/sampling.cpp +92 -10
- package/src/llama.cpp/common/sampling.h +6 -1
- package/src/llama.cpp/common/train.cpp +2 -2
- package/src/llama.cpp/examples/CMakeLists.txt +3 -0
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +13 -4
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +2 -2
- package/src/llama.cpp/examples/finetune/finetune.cpp +4 -3
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +2 -2
- package/src/llama.cpp/examples/infill/infill.cpp +8 -8
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +57 -8
- package/src/llama.cpp/examples/llama.android/llama/CMakeLists.txt +55 -0
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/CMakeLists.txt +7 -8
- package/src/llama.cpp/examples/llama.android/{app → llama}/src/main/cpp/llama-android.cpp +14 -14
- package/src/llama.cpp/examples/llava/clip.h +1 -1
- package/src/llama.cpp/examples/llava/llava-cli.cpp +27 -7
- package/src/llama.cpp/examples/llava/llava.cpp +0 -15
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +1 -1
- package/src/llama.cpp/examples/main/main.cpp +29 -17
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +9 -9
- package/src/llama.cpp/examples/quantize/quantize.cpp +2 -2
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +2 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +2 -0
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +134 -0
- package/src/llama.cpp/examples/server/server.cpp +33 -25
- package/src/llama.cpp/examples/server/utils.hpp +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +359 -9
- package/src/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +4 -3
- package/src/llama.cpp/ggml-backend.c +2 -3
- package/src/llama.cpp/ggml-common.h +0 -54
- package/src/llama.cpp/ggml-cuda.h +1 -0
- package/src/llama.cpp/ggml-impl.h +51 -0
- package/src/llama.cpp/ggml-kompute.cpp +13 -3
- package/src/llama.cpp/ggml-opencl.cpp +4 -1
- package/src/llama.cpp/ggml-quants.c +3715 -2050
- package/src/llama.cpp/ggml-rpc.cpp +1155 -0
- package/src/llama.cpp/ggml-rpc.h +24 -0
- package/src/llama.cpp/ggml-sycl.cpp +119 -673
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +9351 -5627
- package/src/llama.cpp/ggml-vulkan.cpp +203 -224
- package/src/llama.cpp/ggml.c +1208 -1483
- package/src/llama.cpp/ggml.h +71 -46
- package/src/llama.cpp/llama.cpp +1374 -938
- package/src/llama.cpp/llama.h +22 -6
- package/src/llama.cpp/requirements.txt +0 -2
- package/src/llama.cpp/tests/CMakeLists.txt +1 -1
- package/src/llama.cpp/tests/test-backend-ops.cpp +120 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +16 -4
- package/src/llama.cpp/tests/test-grad0.cpp +43 -83
- package/src/llama.cpp/tests/test-grammar-integration.cpp +46 -0
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +27 -3
- package/src/llama.cpp/unicode-data.cpp +6969 -2169
- package/src/llama.cpp/unicode-data.h +15 -12
- package/src/llama.cpp/unicode.cpp +89 -111
- package/src/llama.cpp/unicode.h +44 -12
- package/src/llama.cpp/build.zig +0 -172
- package/src/llama.cpp/ggml-mpi.c +0 -216
- package/src/llama.cpp/ggml-mpi.h +0 -39
- package/src/llama.cpp/requirements/requirements-convert-lora-to-ggml.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert-persimmon-to-gguf.txt +0 -2
package/src/llama.cpp/llama.cpp
CHANGED
@@ -7,6 +7,10 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
 #ifdef GGML_USE_CUDA
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -22,16 +26,9 @@
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
-
-
-#
-#ifndef QK_K
-#  ifdef GGML_QKK_64
-#    define QK_K 64
-#  else
-#    define QK_K 256
-#  endif
-#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
 
 #ifdef __has_include
 #if __has_include(<unistd.h>)
@@ -106,7 +103,7 @@
 #endif
 
 #define LLAMA_MAX_NODES 8192
-#define LLAMA_MAX_EXPERTS
+#define LLAMA_MAX_EXPERTS 128
 
 //
 // logging
@@ -201,10 +198,10 @@ enum llm_arch {
     LLM_ARCH_GPTNEOX,
     LLM_ARCH_MPT,
     LLM_ARCH_STARCODER,
-    LLM_ARCH_PERSIMMON,
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
     LLM_ARCH_QWEN,
@@ -224,43 +221,45 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_ARCTIC,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA, "llama" },
-    { LLM_ARCH_FALCON, "falcon" },
-    { LLM_ARCH_GROK, "grok" },
-    { LLM_ARCH_GPT2, "gpt2" },
-    { LLM_ARCH_GPTJ, "gptj" },
-    { LLM_ARCH_GPTNEOX, "gptneox" },
-    { LLM_ARCH_MPT, "mpt" },
-    { LLM_ARCH_BAICHUAN, "baichuan" },
-    { LLM_ARCH_STARCODER, "starcoder" },
-    { LLM_ARCH_PERSIMMON, "persimmon" },
-    { LLM_ARCH_REFACT, "refact" },
-    { LLM_ARCH_BERT, "bert" },
-    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
-    { LLM_ARCH_BLOOM, "bloom" },
-    { LLM_ARCH_STABLELM, "stablelm" },
-    { LLM_ARCH_QWEN, "qwen" },
-    { LLM_ARCH_QWEN2, "qwen2" },
-    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
-    { LLM_ARCH_PHI2, "phi2" },
-    { LLM_ARCH_PHI3, "phi3" },
-    { LLM_ARCH_PLAMO, "plamo" },
-    { LLM_ARCH_CODESHELL, "codeshell" },
-    { LLM_ARCH_ORION, "orion" },
-    { LLM_ARCH_INTERNLM2, "internlm2" },
-    { LLM_ARCH_MINICPM, "minicpm" },
-    { LLM_ARCH_GEMMA, "gemma" },
-    { LLM_ARCH_STARCODER2, "starcoder2" },
-    { LLM_ARCH_MAMBA, "mamba" },
-    { LLM_ARCH_XVERSE, "xverse" },
-    { LLM_ARCH_COMMAND_R, "command-r" },
-    { LLM_ARCH_DBRX, "dbrx" },
-    { LLM_ARCH_OLMO, "olmo" },
-    { LLM_ARCH_UNKNOWN, "(unknown)" },
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
 enum llm_kv {
@@ -303,6 +302,7 @@ enum llm_kv {
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
     LLM_KV_ROPE_SCALING_FACTOR,
+    LLM_KV_ROPE_SCALING_ATTN_FACTOR,
     LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,
     LLM_KV_ROPE_SCALING_FINETUNED,
 
@@ -380,6 +380,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
     { LLM_KV_ROPE_SCALING_FACTOR, "%s.rope.scaling.factor" },
+    { LLM_KV_ROPE_SCALING_ATTN_FACTOR, "%s.rope.scaling.attn_factor" },
     { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" },
     { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" },
 
@@ -435,6 +436,8 @@ enum llm_tensor {
     LLM_TENSOR_OUTPUT,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_ROPE_FREQS,
+    LLM_TENSOR_ROPE_FACTORS_LONG,
+    LLM_TENSOR_ROPE_FACTORS_SHORT,
     LLM_TENSOR_ATTN_Q,
     LLM_TENSOR_ATTN_K,
     LLM_TENSOR_ATTN_V,
@@ -454,6 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
     LLM_TENSOR_FFN_GATE_EXP,
     LLM_TENSOR_FFN_UP_EXP,
+    LLM_TENSOR_FFN_NORM_EXPS,
     LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
     LLM_TENSOR_FFN_GATE_EXPS,
     LLM_TENSOR_FFN_UP_EXPS,
@@ -592,23 +596,6 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
-    {
-        LLM_ARCH_PERSIMMON,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd"},
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm"},
-            { LLM_TENSOR_OUTPUT, "output"},
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
-            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
-            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
-        },
-    },
     {
         LLM_ARCH_MPT,
         {
@@ -691,6 +678,25 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_JINA_BERT_V2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES, "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_BLOOM,
         {
@@ -800,18 +806,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     {
         LLM_ARCH_PHI3,
         {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
     {
@@ -1027,6 +1035,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1664,91 +1694,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     GGML_UNUSED(host_buffer);
 }
 
-static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_METAL
-    buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
-#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
-#elif defined(GGML_USE_CLBLAST)
-    buft = ggml_backend_opencl_buffer_type();
-#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
-    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_cpu(true);
-    }
-    return buft;
-
-    GGML_UNUSED(gpu);
-}
-
-static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
-    ggml_backend_buffer_type_t buft = nullptr;
-
-#ifdef GGML_USE_CUDA
-    if (ggml_backend_cuda_get_device_count() > 1) {
-        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
-    }
-#endif
-
-#ifdef GGML_USE_SYCL
-    if (ggml_backend_sycl_get_device_count() > 1) {
-        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
-    }
-#endif
-
-    if (buft == nullptr) {
-        buft = llama_default_buffer_type_offload(fallback_gpu);
-    }
-    return buft;
-
-    GGML_UNUSED(tensor_split);
-}
-
-static size_t llama_get_device_count() {
-#if defined(GGML_USE_CUDA)
-    return ggml_backend_cuda_get_device_count();
-#elif defined(GGML_USE_SYCL)
-    return ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    return ggml_backend_vk_get_device_count();
-#else
-    return 1;
-#endif
-}
-
-static size_t llama_get_device_memory(int device) {
-#if defined(GGML_USE_CUDA)
-    size_t total;
-    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_SYCL)
-    size_t total;
-    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
-    return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
-#else
-    return 1;
-    GGML_UNUSED(device);
-#endif
-}
-
 //
 // globals
 //
@@ -1757,6 +1702,8 @@ struct llama_state {
     llama_state() {
 #ifdef GGML_USE_METAL
         ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+        ggml_backend_cuda_log_set_callback(log_callback, log_callback_user_data);
 #endif
     }
 
@@ -1770,17 +1717,24 @@ static llama_state g_state;
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
+    MODEL_14M,
     MODEL_17M,
     MODEL_22M,
     MODEL_33M,
+    MODEL_70M,
     MODEL_109M,
     MODEL_137M,
+    MODEL_160M,
     MODEL_335M,
+    MODEL_410M,
     MODEL_0_5B,
     MODEL_1B,
+    MODEL_1_4B,
     MODEL_2B,
+    MODEL_2_8B,
     MODEL_3B,
     MODEL_4B,
+    MODEL_6_9B,
     MODEL_7B,
     MODEL_8B,
     MODEL_12B,
@@ -1803,6 +1757,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };
 
 static const size_t kiB = 1024;
@@ -1812,6 +1767,7 @@ static const size_t GiB = 1024*MiB;
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
+    bool use_par_res;
 
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
@@ -1830,6 +1786,7 @@ struct llama_hparams {
     float f_norm_eps;
     float f_norm_rms_eps;
 
+    float rope_attn_factor = 1.0f;
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
@@ -1845,7 +1802,7 @@ struct llama_hparams {
     float f_logit_scale = 0.0f;
 
     bool causal_attn = true;
-    bool use_alibi = false;
+    bool use_alibi = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@@ -1878,6 +1835,7 @@ struct llama_hparams {
 
         if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
         if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
+        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
         if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
 
@@ -1975,6 +1933,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;
 
     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -2012,6 +1971,10 @@ struct llama_layer {
     // mamba bias
     struct ggml_tensor * ssm_conv1d_b;
     struct ggml_tensor * ssm_dt_b;
+
+    // long rope factors
+    struct ggml_tensor * rope_long = nullptr;
+    struct ggml_tensor * rope_short = nullptr;
 };
 
 struct llama_kv_cell {
@@ -2189,6 +2152,8 @@ struct llama_model {
     int main_gpu;
     int n_gpu_layers;
 
+    std::vector<std::string> rpc_servers;
+
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -2317,7 +2282,6 @@ struct llama_context {
     struct ggml_tensor * inp_pos; // I32 [n_batch]
     struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
     struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
-    struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
     struct ggml_tensor * inp_K_shift; // I32 [kv_size]
     struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
     struct ggml_tensor * inp_cls; // I32 [n_batch]
@@ -2327,11 +2291,105 @@ struct llama_context {
 
     // control vectors
     struct llama_control_vector cvec;
+};
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_RPC
+    std::string endpoint = model.rpc_servers[gpu];
+    buft = ggml_backend_rpc_buffer_type(endpoint.c_str());
+#elif defined(GGML_USE_METAL)
+    buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUDA)
+    buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_VULKAN)
+    buft = ggml_backend_vk_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+    buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_cpu(true);
+    }
+    return buft;
+    GGML_UNUSED(model);
+    GGML_UNUSED(gpu);
+}
 
-
-
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
+    ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUDA
+    if (ggml_backend_cuda_get_device_count() > 1) {
+        buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+    }
 #endif
-
+
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
+    if (buft == nullptr) {
+        buft = llama_default_buffer_type_offload(model, fallback_gpu);
+    }
+    return buft;
+
+    GGML_UNUSED(tensor_split);
+}
+
+static size_t llama_get_device_count(const llama_model & model) {
+#if defined(GGML_USE_RPC)
+    return model.rpc_servers.size();
+#elif defined(GGML_USE_CUDA)
+    return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
+#elif defined(GGML_USE_VULKAN)
+    return ggml_backend_vk_get_device_count();
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+}
+
+static size_t llama_get_device_memory(const llama_model & model, int device) {
+#if defined(GGML_USE_RPC)
+    size_t total;
+    size_t free;
+    std::string endpoint = model.rpc_servers[device];
+    ggml_backend_rpc_get_device_memory(endpoint.c_str(), &free, &total);
+    return free;
+#elif defined(GGML_USE_CUDA)
+    size_t total;
+    size_t free;
+    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    return free;
+#elif defined(GGML_USE_VULKAN)
+    size_t total;
+    size_t free;
+    ggml_backend_vk_get_device_memory(device, &free, &total);
+    return free;
+#else
+    return 1;
+#endif
+    GGML_UNUSED(model);
+    GGML_UNUSED(device);
+}
 
 //
 // kv cache helpers
@@ -2452,7 +2510,6 @@ static bool llama_kv_cache_init(
 static bool llama_kv_cache_find_slot(
            struct llama_kv_cache & cache,
         const struct llama_batch & batch) {
-    const uint32_t n_ctx = cache.size;
     const uint32_t n_tokens = batch.n_tokens;
 
     if (cache.recurrent) {
@@ -2503,16 +2560,16 @@ static bool llama_kv_cache_find_slot(
     }
     // otherwise, one cell per token.
 
-    if (n_tokens > n_ctx) {
-        LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx);
+    if (n_tokens > cache.size) {
+        LLAMA_LOG_ERROR("%s: n_tokens=%d > cache.size=%d\n", __func__, n_tokens, cache.size);
         return false;
     }
 
     uint32_t n_tested = 0;
 
     while (true) {
-        if (cache.head + n_tokens > n_ctx) {
-            n_tested += n_ctx - cache.head;
+        if (cache.head + n_tokens > cache.size) {
+            n_tested += cache.size - cache.head;
             cache.head = 0;
             continue;
         }
@@ -2531,7 +2588,7 @@ static bool llama_kv_cache_find_slot(
             break;
         }
 
-        if (n_tested >= n_ctx) {
+        if (n_tested >= cache.size) {
             //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens);
             return false;
         }
@@ -2785,6 +2842,11 @@ static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
     cache.do_defrag = true;
 }
 
+static uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams) {
+    // the FA kernels require padding to avoid extra runtime boundary checks
+    return cparams.flash_attn ? 256u : 32u;
+}
+
 //
 // model loading and saving
 //
@@ -3287,22 +3349,55 @@ struct llama_model_loader {
     }
 
     template<typename T>
-    bool
-
+    bool get_arr(const std::string & key, std::vector<T> & result, const bool required = true) {
+        const int kid = gguf_find_key(meta, key.c_str());
 
-
-
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
 
-
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta, kid);
 
-        if (
-            throw std::runtime_error(format("
+        if (arr_info.gt != GGUF_TYPE_FLOAT32 && arr_info.gt != GGUF_TYPE_INT32) {
+            throw std::runtime_error(format("%s is not a float32 or int32 array", key.c_str()));
         }
 
-
-
+        // GGML_ASSERT(gguf_type_size(arr_info.gt) == sizeof(T));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_FLOAT32 || std::is_same<T, float>::value));
+        GGML_ASSERT((arr_info.gt != GGUF_TYPE_INT32 || std::is_same<T, int>::value));
 
-
+        result.resize(arr_info.length);
+        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
+
+        return true;
+    }
+
+    template<typename T>
+    bool get_arr(const enum llm_kv kid, T& result, const bool required = true) {
+        return get_arr(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(meta, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
     bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
         return get_key(llm_kv(kid), result, required);
     }
@@ -3360,11 +3455,15 @@ struct llama_model_loader {
         return get_tensor_meta(get_tensor_name(i));
     }
 
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) {
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur, bool duplicated) {
         struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur);
         ggml_set_name(tensor, ggml_get_name(cur));
 
-
+        if (duplicated) {
+            size_data += ggml_nbytes(cur);
+        } else {
+            n_created++;
+        }
 
         return tensor;
     }
@@ -3399,14 +3498,17 @@ struct llama_model_loader {
         return cur;
     }
 
-
-
+    static const int TENSOR_NOT_REQUIRED = 1;
+    static const int TENSOR_DUPLICATED = 2;
+
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, int flags = 0) {
+        const struct ggml_tensor * cur = check_tensor_dims(name, ne, !(flags & TENSOR_NOT_REQUIRED));
 
         if (cur == NULL) {
             return NULL;
         }
 
-        return create_tensor_for(ctx, cur);
+        return create_tensor_for(ctx, cur, flags & TENSOR_DUPLICATED);
     }
 
     struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector<int64_t> & ne, size_t offset, bool required = true) {
@@ -3706,37 +3808,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 
 static const char * llama_model_type_name(e_model type) {
     switch (type) {
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
-        case
+        case MODEL_14M: return "14M";
+        case MODEL_17M: return "17M";
+        case MODEL_22M: return "22M";
+        case MODEL_33M: return "33M";
+        case MODEL_70M: return "70M";
+        case MODEL_109M: return "109M";
+        case MODEL_137M: return "137M";
+        case MODEL_160M: return "160M";
+        case MODEL_335M: return "335M";
+        case MODEL_410M: return "410M";
+        case MODEL_0_5B: return "0.5B";
+        case MODEL_1B: return "1B";
+        case MODEL_1_4B: return "1.4B";
+        case MODEL_2B: return "2B";
+        case MODEL_2_8B: return "2.8B";
+        case MODEL_3B: return "3B";
+        case MODEL_4B: return "4B";
+        case MODEL_6_9B: return "6.9B";
+        case MODEL_7B: return "7B";
+        case MODEL_8B: return "8B";
+        case MODEL_12B: return "12B";
+        case MODEL_13B: return "13B";
+        case MODEL_14B: return "14B";
+        case MODEL_15B: return "15B";
+        case MODEL_20B: return "20B";
+        case MODEL_30B: return "30B";
+        case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
+        case MODEL_40B: return "40B";
+        case MODEL_65B: return "65B";
+        case MODEL_70B: return "70B";
+        case MODEL_314B: return "314B";
+        case MODEL_SMALL: return "0.1B";
+        case MODEL_MEDIUM: return "0.4B";
+        case MODEL_LARGE: return "0.8B";
+        case MODEL_XL: return "1.5B";
+        case MODEL_A2_7B: return "A2.7B";
+        case MODEL_8x7B: return "8x7B";
+        case MODEL_8x22B: return "8x22B";
+        case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
+        default: return "?B";
     }
 }
 
@@ -3779,6 +3892,12 @@ static void llm_load_hparams(
 
     // get hparams kv
     ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
     ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
     ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
     ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
@@ -3823,6 +3942,8 @@ static void llm_load_hparams(
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
     // sanity check for n_rot (optional)
     {
         hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head;
@@ -3860,7 +3981,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 22: model.type = e_model::MODEL_1B; break;
                     case 26: model.type = e_model::MODEL_3B; break;
-                    case 32: model.type = hparams.
+                    case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
                     case 48: model.type = e_model::MODEL_34B; break;
                     case 60: model.type = e_model::MODEL_30B; break;
@@ -3922,14 +4043,6 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
-        case LLM_ARCH_PERSIMMON:
-            {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                switch (hparams.n_layer) {
-                    case 36: model.type = e_model::MODEL_8B; break;
-                    default: model.type = e_model::MODEL_UNKNOWN;
-                }
-            } break;
         case LLM_ARCH_REFACT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3962,6 +4075,19 @@ static void llm_load_hparams(
                         model.type = e_model::MODEL_335M; break; // bge-large
                 }
             } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4: model.type = e_model::MODEL_33M; break; // jina-embeddings-small
+                    case 12: model.type = e_model::MODEL_137M; break; // jina-embeddings-base
+                }
+            } break;
         case LLM_ARCH_NOMIC_BERT:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -4058,6 +4184,7 @@ static void llm_load_hparams(
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
+                    case 40: model.type = e_model::MODEL_14B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -4198,6 +4325,65 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff) {
+                            case 512: model.type = e_model::MODEL_14M; break;
+                            case 2048: model.type = e_model::MODEL_70M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff) {
+                            case 3072: model.type = e_model::MODEL_160M; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff) {
+                            case 8192: model.type = e_model::MODEL_1B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff) {
+                            case 4096: model.type = e_model::MODEL_410M; break;
+                            case 8192: model.type = e_model::MODEL_1_4B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff) {
+                            case 10240: model.type = e_model::MODEL_2_8B; break;
+                            case 16384: model.type = e_model::MODEL_6_9B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff) {
+                            case 20480: model.type = e_model::MODEL_12B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff) {
+                            case 24576: model.type = e_model::MODEL_20B; break;
+                            default: model.type = e_model::MODEL_UNKNOWN;
+                        } break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }
 
@@ -4383,7 +4569,11 @@ static void llm_load_vocab(
                 tokenizer_pre == "starcoder") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
         } else if (
-                tokenizer_pre == "gpt-2") {
+                tokenizer_pre == "gpt-2" ||
+                tokenizer_pre == "jina-es" ||
+                tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "jina-v2-es" ||
+                tokenizer_pre == "jina-v2-de") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -4394,12 +4584,18 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "qwen2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
+        } else if (
+                tokenizer_pre == "stablelm2") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STABLELM2;
         } else if (
                 tokenizer_pre == "olmo") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
         } else if (
                 tokenizer_pre == "dbrx") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+        } else if (
+                tokenizer_pre == "smaug-bpe") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -4515,7 +4711,8 @@ static void llm_load_vocab(
                     (t.first == "<|eot_id|>" ||
                      t.first == "<|im_end|>" ||
                      t.first == "<|end|>" ||
-                     t.first == "<end_of_turn>"
+                     t.first == "<end_of_turn>" ||
+                     t.first == "<|endoftext|>"
                     )
                ) {
                 vocab.special_eot_id = t.second;
@@ -4743,13 +4940,13 @@ static bool llm_load_tensors(
 
     if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
-        int device_count = llama_get_device_count();
+        int device_count = llama_get_device_count(model);
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
         std::vector<float> splits(device_count);
         if (all_zero) {
             // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
-                splits[i] = llama_get_device_memory(i);
+                splits[i] = llama_get_device_memory(model, i);
             }
         } else {
             std::copy(tensor_split, tensor_split + device_count, splits.begin());
@@ -4769,35 +4966,35 @@ static bool llm_load_tensors(
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits.begin();
-            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_layer[i] = llama_default_buffer_type_offload(model, layer_gpu);
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits.begin();
-            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+            model.buft_output = llama_default_buffer_type_offload(model, layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
         if (split_mode == LLAMA_SPLIT_MODE_ROW) {
-            split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+            split_buft = llama_default_buffer_type_split(model, main_gpu, tensor_split);
         } else {
             // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
-            split_buft = llama_default_buffer_type_offload(main_gpu);
+            split_buft = llama_default_buffer_type_offload(model, main_gpu);
        }
         // assign the repeating layers
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             model.buft_layer[i] = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         }
         // assign the output layer
         if (n_gpu_layers > n_layer) {
             model.buft_output = {
                 split_buft,
-                llama_default_buffer_type_offload(main_gpu)
+                llama_default_buffer_type_offload(model, main_gpu)
             };
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -4841,6 +5038,7 @@ static bool llm_load_tensors(
     // create tensors for the weights
     {
         const int64_t n_embd = hparams.n_embd;
+        const int64_t n_embd_head = n_embd / hparams.n_head;
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
         const int64_t n_embd_gqa = n_embd_v_gqa;
@@ -4875,12 +5073,10 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     if (model.arch != LLM_ARCH_MINICPM){
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         // if output is NULL, init from the input tok embed
                         if (model.output == NULL) {
-                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                            ml.n_created--; // artificial tensor
-                            ml.size_data += ggml_nbytes(model.output);
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                         }
                     }
                 }
@@ -4899,10 +5095,10 @@ static bool llm_load_tensors(
                     layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
                     // optional bias tensors
-                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
-                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
-                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
-                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+                    layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
@@ -4913,7 +5109,7 @@ static bool llm_load_tensors(
                     } else {
                         layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                         if (layer.ffn_gate_exps) {
                             layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                             layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -4955,12 +5151,10 @@ static bool llm_load_tensors(
                 // output
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
                     if (model.output == NULL) {
-                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                        ml.n_created--; // artificial tensor
-                        ml.size_data += ggml_nbytes(model.output);
+                        model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
                     }
                 }
 
@@ -4983,7 +5177,7 @@ static bool llm_load_tensors(
 
                     layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
 
-                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                    layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     if (layer.ffn_gate_exps) {
                         layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
                         layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
@@ -5085,11 +5279,9 @@ static bool llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (!model.output) {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
 }
 }

@@ -5102,8 +5294,8 @@ static bool llm_load_tensors(
 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd},
-layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd},
+layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
@@ -5121,7 +5313,12 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+if (!model.output) {
+// needs to be on GPU
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+}
+
 }

 for (int i = 0; i < n_layer; ++i) {
@@ -5149,47 +5346,6 @@ static bool llm_load_tensors(
 layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
 }
 } break;
-case LLM_ARCH_PERSIMMON:
-{
-model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-
-{
-model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-}
-
-for (int i = 0; i < n_layer; ++i) {
-ggml_context * ctx_layer = ctx_for_layer(i);
-ggml_context * ctx_split = ctx_for_layer_split(i);
-
-auto & layer = model.layers[i];
-
-layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
-
-layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
-
-layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
-
-layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
-
-layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
-
-layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
-
-layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
-layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
-
-layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
-layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
-}
-} break;
 case LLM_ARCH_BERT:
 case LLM_ARCH_NOMIC_BERT:
 {
@@ -5242,6 +5398,50 @@ static bool llm_load_tensors(
 layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
 }
 } break;
+case LLM_ARCH_JINA_BERT_V2:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // word_embeddings
+model.type_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_vocab_type}); //token_type_embeddings
+model.tok_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); // LayerNorm
+model.tok_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); //LayerNorm bias
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i]; // JinaBertLayer
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); //output_dens
+layer.bo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); //output_dens
+
+layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm
+layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd});
+
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+layer.layer_out_norm = ml.create_tensor(ctx_split, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd});
+layer.layer_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd});
+}
+} break;
 case LLM_ARCH_BLOOM:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
@@ -5283,18 +5483,16 @@ static bool llm_load_tensors(
 case LLM_ARCH_MPT:
 {
 model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train},
+model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, llama_model_loader::TENSOR_NOT_REQUIRED);

 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 if (!model.output) {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // needs to be on GPU
 }
 }

@@ -5305,31 +5503,31 @@ static bool llm_load_tensors(
 auto & layer = model.layers[i];

 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
-layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd},
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
-layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
-layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
-layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd},
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
-layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff},
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);

-layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd},
-layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd},
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

-layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd},
-layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd},
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 // AWQ ScaleActivation layer
-layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff},
+layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED);
 }
 } break;
 case LLM_ARCH_STABLELM:
@@ -5358,17 +5556,17 @@ static bool llm_load_tensors(
 layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

 // optional bias tensors, present in Stable LM 2 1.6B
-layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
-layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
-layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

 // optional q and k layernorms, present in StableLM 2 12B
-layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head},
-layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv},
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, llama_model_loader::TENSOR_NOT_REQUIRED);

 // optional FFN norm, not present in StableLM 2 12B which uses parallel residual
-layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd},
-layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd},
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);

 layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
@@ -5411,12 +5609,10 @@ static bool llm_load_tensors(
 // output
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }

@@ -5514,8 +5710,8 @@ static bool llm_load_tensors(
 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
-layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED);

 if (layer.wqkv == nullptr) {
 layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -5552,17 +5748,20 @@ static bool llm_load_tensors(
 ggml_context* ctx_layer = ctx_for_layer(i);
 ggml_context* ctx_split = ctx_for_layer_split(i);

-auto& layer = model.layers[i];
+auto & layer = model.layers[i];

 layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd });

-layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
-layer.wo
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, llama_model_loader::TENSOR_NOT_REQUIRED);
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd });

 layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd });

 layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd });
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff });
+
+layer.rope_long = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+layer.rope_short = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight"), { n_embd_head/2 }, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 }
 } break;
 case LLM_ARCH_PLAMO:
@@ -5731,9 +5930,7 @@ static bool llm_load_tensors(

 // output
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // same as tok_embd, duplicated to allow offloading
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

 const int64_t n_ff = hparams.n_ff;
 const int64_t n_embd_head_k = hparams.n_embd_head_k;
@@ -5768,12 +5965,10 @@ static bool llm_load_tensors(
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }

 }
@@ -5824,12 +6019,10 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});

-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed, duplicated to allow offloading
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }

@@ -5890,9 +6083,7 @@ static bool llm_load_tensors(
 {
 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
 // init output from the input tok embed
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }

 for (int i = 0; i < n_layer; ++i) {
@@ -5924,12 +6115,10 @@ static bool llm_load_tensors(

 // output
 {
-model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab},
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
 // if output is NULL, init from the input tok embed
 if (model.output == NULL) {
-model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-ml.n_created--; // artificial tensor
-ml.size_data += ggml_nbytes(model.output);
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
 }
 }

@@ -5949,6 +6138,81 @@ static bool llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 }
 } break;
+case LLM_ARCH_GPTNEOX:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+}
+} break;
+case LLM_ARCH_ARCTIC:
+{
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+// if output is NULL, init from the input tok embed
+if (model.output == NULL) {
+model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+}
+}
+
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
+
+auto & layer = model.layers[i];
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+}
+} break;
 default:
 throw std::runtime_error("unknown architecture");
 }
@@ -6213,10 +6477,7 @@ static struct ggml_tensor * llm_build_inp_embd(

 inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
 } else {
-
-GGML_ASSERT(false && "not implemented");
-#endif
-lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
+lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
 inpL = lctx.inp_embd;
 ggml_set_input(lctx.inp_embd);
 }
@@ -6318,7 +6579,7 @@ static struct ggml_tensor * llm_build_ffn(
 llm_ffn_gate_type type_gate,
 const llm_build_cb & cb,
 int il) {
-struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
+struct ggml_tensor * tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
 cb(tmp, "ffn_up", il);

 if (up_b) {
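The `llm_build_ffn` change just above makes the up-projection optional: when no `up` tensor exists, the input is forwarded unchanged instead of multiplying by a missing matrix. A tiny self-contained sketch of that guard, using stand-in types rather than the real ggml API:

```cpp
// Sketch: the optional up-projection guard, with a hypothetical "tensor" type.
#include <cassert>

struct fake_tensor { int id; };

fake_tensor * fake_mul_mat(fake_tensor * a, fake_tensor * b) {
    static fake_tensor out{42};          // stand-in for the product tensor
    return (a && b) ? &out : nullptr;
}

// mirrors: tmp = up ? ggml_mul_mat(ctx, up, cur) : cur;
fake_tensor * ffn_up(fake_tensor * up, fake_tensor * cur) {
    return up ? fake_mul_mat(up, cur) : cur;
}

int main() {
    fake_tensor cur{1}, up{2};
    assert(ffn_up(&up, &cur) != &cur);     // with an up tensor, a product is produced
    assert(ffn_up(nullptr, &cur) == &cur); // without one, the input passes through
    return 0;
}
```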
@@ -6500,7 +6761,6 @@ static struct ggml_tensor * llm_build_kqv(
 struct ggml_tensor * wo_b,
 struct ggml_tensor * q_cur,
 struct ggml_tensor * kq_mask,
-struct ggml_tensor * kq_pos,
 int32_t n_tokens,
 int32_t n_kv,
 float kq_scale,
@@ -6512,6 +6772,7 @@ static struct ggml_tensor * llm_build_kqv(
 const int64_t n_embd_head_k = hparams.n_embd_head_k;
 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const int64_t n_embd_head_v = hparams.n_embd_head_v;
+const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();

 struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
 cb(q, "q", il);
@@ -6530,31 +6791,27 @@ static struct ggml_tensor * llm_build_kqv(
 GGML_UNUSED(model);
 GGML_UNUSED(n_ctx);

-// note: if this assert triggers, then some check has failed earlier
-// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
-GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
-
 // split cached v into n_head heads (not transposed)
 struct ggml_tensor * v =
 ggml_view_3d(ctx, kv.v_l[il],
 n_embd_head_v, n_kv, n_head_kv,
-ggml_row_size(kv.v_l[il]->type,
-ggml_row_size(kv.v_l[il]->type,
+ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
 0);
 cb(v, "v", il);

-cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
+cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);

-if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
 }

-cur = ggml_reshape_2d(ctx, cur,
+cur = ggml_reshape_2d(ctx, cur, n_embd_head_v*n_head, n_tokens);
 } else {
 struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
 cb(kq, "kq", il);

-if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
+if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_GPTNEOX) {
 // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
 // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
 ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
@@ -6574,28 +6831,8 @@ static struct ggml_tensor * llm_build_kqv(
 kq = ggml_scale(ctx, kq, 30);
 }

-
-
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-if (hparams.use_alibi) {
-kq = ggml_scale(ctx, kq, kq_scale);
-cb(kq, "kq_scaled", il);
-
-kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-cb(kq, "kq_scaled_alibi", il);
-
-kq = ggml_add(ctx, kq, kq_mask);
-cb(kq, "kq_masked", il);
-
-kq = ggml_soft_max(ctx, kq);
-cb(kq, "kq_soft_max", il);
-} else
-#endif
-{
-kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
-cb(kq, "kq_soft_max_ext", il);
-}
+kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
+cb(kq, "kq_soft_max_ext", il);

 GGML_ASSERT(kv.size == n_ctx);

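In the hunk above, the explicit `ggml_alibi()` fallback disappears and ALiBi is instead handled inside `ggml_soft_max_ext()` via the `hparams.f_max_alibi_bias` argument, so no separate positions tensor is needed. The sketch below shows how a maximum-bias value is commonly turned into per-head ALiBi slopes; it follows the usual ALiBi convention and is illustrative only, not a claim about ggml's exact kernel.

```cpp
// Sketch: deriving per-head ALiBi slopes from a single max-bias value.
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 8;
    const float max_bias = 8.0f; // e.g. hparams.f_max_alibi_bias
    // heads get geometrically decaying slopes; with max_bias = 8 and 8 heads the
    // first head gets 2^-1, the second 2^-2, and so on (one common convention)
    const int   n_head_log2 = 1 << (int) std::floor(std::log2((float) n_head));
    const float m0 = std::pow(2.0f, -(max_bias)        / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    for (int h = 0; h < n_head; ++h) {
        const float slope = h < n_head_log2 ? std::pow(m0, h + 1)
                                            : std::pow(m1, 2 * (h - n_head_log2) + 1);
        std::printf("head %d: slope %.6f\n", h, slope);
        // each attention score(q_i, k_j) gets slope * (j - i) added before the softmax
    }
    return 0;
}
```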
@@ -6614,7 +6851,7 @@ static struct ggml_tensor * llm_build_kqv(
 struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3);
 cb(kqv_merged, "kqv_merged", il);

-cur = ggml_cont_2d(ctx, kqv_merged,
+cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_v*n_head, n_tokens);
 cb(cur, "kqv_merged_cont", il);
 }

@@ -6645,7 +6882,6 @@ static struct ggml_tensor * llm_build_kv(
 struct ggml_tensor * v_cur,
 struct ggml_tensor * q_cur,
 struct ggml_tensor * kq_mask,
-struct ggml_tensor * kq_pos,
 int32_t n_tokens,
 int32_t kv_head,
 int32_t n_kv,
@@ -6664,7 +6900,7 @@ static struct ggml_tensor * llm_build_kv(
 struct ggml_tensor * cur;

 cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
-q_cur, kq_mask,
+q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
 cb(cur, "kqv_out", il);

 return cur;
@@ -6771,18 +7007,17 @@ struct llm_build_context {

 ctx0 = ggml_init(params);

-lctx.inp_tokens
-lctx.inp_embd
-lctx.inp_pos
+lctx.inp_tokens = nullptr;
+lctx.inp_embd = nullptr;
+lctx.inp_pos = nullptr;
 lctx.inp_out_ids = nullptr;
 lctx.inp_KQ_mask = nullptr;
-lctx.inp_KQ_pos = nullptr;
 lctx.inp_K_shift = nullptr;
-lctx.inp_mean
-lctx.inp_cls
-lctx.inp_s_copy
-lctx.inp_s_mask
-lctx.inp_s_seq
+lctx.inp_mean = nullptr;
+lctx.inp_cls = nullptr;
+lctx.inp_s_copy = nullptr;
+lctx.inp_s_mask = nullptr;
+lctx.inp_s_seq = nullptr;
 }

 void free() {
@@ -6801,17 +7036,20 @@ struct llm_build_context {
 cb(lctx.inp_K_shift, "K_shift", -1);
 ggml_set_input(lctx.inp_K_shift);

+
 for (int il = 0; il < n_layer; ++il) {
+struct ggml_tensor * rope_factors = build_rope_factors(il);
 struct ggml_tensor * tmp =
 // we rotate only the first n_rot dimensions
-
+ggml_rope_ext_inplace(ctx0,
 ggml_view_3d(ctx0, kv_self.k_l[il],
 n_embd_head_k, n_head_kv, n_ctx,
 ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
 ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
 0),
-lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow);
+
 cb(tmp, "K_shifted", il);
 ggml_build_forward_expand(gf, tmp);
 }
@@ -6914,6 +7152,17 @@ struct llm_build_context {
 return lctx.inp_pos;
 }

+struct ggml_tensor * build_rope_factors(int il) {
+// choose long/short freq factors based on the context size
+const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
+
+if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) {
+return model.layers[il].rope_long;
+}
+
+return model.layers[il].rope_short;
+}
+
 struct ggml_tensor * build_inp_out_ids() {
 lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs);
 cb(lctx.inp_out_ids, "inp_out_ids", -1);
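The new `build_rope_factors()` helper above selects the long or short RoPE frequency-factor tensor by comparing the per-sequence context size against the model's original training context. A standalone sketch of the same decision, with made-up numbers:

```cpp
// Sketch: choosing long vs. short RoPE scaling factors by context length,
// mirroring build_rope_factors above; the values here are made up.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_ctx           = 32768; // total KV context configured for the run
    const uint32_t n_seq_max       = 2;     // parallel sequences sharing that context
    const uint32_t n_yarn_orig_ctx = 4096;  // original training context of the model

    const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;
    const bool use_long = n_ctx_per_seq > n_yarn_orig_ctx;

    std::printf("per-sequence ctx = %u -> using %s rope factors\n",
                n_ctx_per_seq, use_long ? "long" : "short");
    return 0;
}
```

Because the comparison uses `n_ctx / n_seq_max`, a large KV cache shared by many parallel sequences can still fall back to the short factors.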
@@ -6932,19 +7181,6 @@ struct llm_build_context {
 return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
 }

-struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
-if (causal) {
-lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
-} else {
-// TODO: this will be needed for ALiBi-based BERT models
-// https://github.com/ggerganov/llama.cpp/pull/6826
-lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens);
-}
-cb(lctx.inp_KQ_pos, "KQ_pos", -1);
-ggml_set_input(lctx.inp_KQ_pos);
-return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
-}
-
 struct ggml_tensor * build_inp_mean() {
 lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
 cb(lctx.inp_mean, "inp_mean", -1);
@@ -7034,15 +7270,15 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }

-Qcur =
-ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+Qcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

-Kcur =
-ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+Kcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
@@ -7050,7 +7286,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7143,9 +7379,6 @@ struct llm_build_context {
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;

@@ -7167,13 +7400,13 @@ struct llm_build_context {

 switch (model.type) {
 case MODEL_7B:
-Qcur =
-ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+Qcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
-Kcur =
-ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+Kcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
@@ -7190,7 +7423,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7260,9 +7493,6 @@ struct llm_build_context {
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
 for (int il = 0; il < n_layer; ++il) {
 struct ggml_tensor * inpSA = inpL;

@@ -7282,22 +7512,22 @@ struct llm_build_context {
 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
 cb(Vcur, "Vcur", il);

-Qcur =
-ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+Qcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

-Kcur =
-ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+Kcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);
 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7403,21 +7633,21 @@ struct llm_build_context {
 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

 // using mode = 2 for neox mode
-Qcur =
-ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+Qcur = ggml_rope_ext(
+ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

-Kcur =
-ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
+Kcur = ggml_rope_ext(
+ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Kcur, "Kcur", il);

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7526,15 +7756,15 @@ struct llm_build_context {
 cb(Vcur, "Vcur", il);
 }

-Qcur =
-ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+Qcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

-Kcur =
-ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+Kcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
@@ -7542,7 +7772,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
 }

 if (il == n_layer - 1) {
@@ -7678,15 +7908,15 @@ struct llm_build_context {
 cb(Kcur, "Kcur", il);
 cb(Vcur, "Vcur", il);

-Qcur =
-ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+Qcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
 cb(Qcur, "Qcur", il);

-Kcur =
-ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+Kcur = ggml_rope_ext(
+ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
 n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
 ext_factor, attn_factor, beta_fast, beta_slow
 );
@@ -7694,7 +7924,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7806,7 +8036,7 @@ struct llm_build_context {

 cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
 model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Qcur, KQ_mask,
+Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
 }

 if (il == n_layer - 1) {
@@ -7855,259 +8085,49 @@ struct llm_build_context {
 return gf;
 }

-struct ggml_cgraph *
+struct ggml_cgraph * build_refact() {
 struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

 const int64_t n_embd_head = hparams.n_embd_head_v;
-GGML_ASSERT(n_embd_head
-GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

 struct ggml_tensor * cur;
 struct ggml_tensor * inpL;

 inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

-// inp_pos - contains the positions
-struct ggml_tensor * inp_pos = build_inp_pos();
-
 // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
 struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

 for (int il = 0; il < n_layer; ++il) {
-struct ggml_tensor *
+struct ggml_tensor * inpSA = inpL;

 cur = llm_build_norm(ctx0, inpL, hparams,
-model.layers[il].attn_norm,
-
-LLM_NORM, cb, il);
+model.layers[il].attn_norm, NULL,
+LLM_NORM_RMS, cb, il);
 cb(cur, "attn_norm", il);

-// self
+// self-attention
 {
-
-cb(
-
-cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-cb(cur, "bqkv", il);
-
-// split qkv
-GGML_ASSERT(n_head_kv == n_head);
+struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+cb(Qcur, "Qcur", il);

-struct ggml_tensor *
-cb(
+struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+cb(Kcur, "Kcur", il);

-struct ggml_tensor *
-cb(
+struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+cb(Vcur, "Vcur", il);

-
-
-ggml_element_size(tmpqkv_perm) * n_embd_head,
-ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-0
-);
-cb(tmpq, "tmpq", il);
+Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+cb(Kcur, "Kcur", il);

-
-
-ggml_element_size(tmpqkv_perm) * n_embd_head,
-ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
-);
-cb(tmpk, "tmpk", il);
+Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+cb(Qcur, "Qcur", il);

-
-
-
-
-LLM_NORM, cb, il);
-cb(tmpq, "tmpq", il);
-
-tmpk = llm_build_norm(ctx0, tmpk, hparams,
-model.layers[il].attn_k_norm,
-model.layers[il].attn_k_norm_b,
-LLM_NORM, cb, il);
-cb(tmpk, "tmpk", il);
-
-// RoPE the first n_rot of q/k, pass the other half, and concat.
-struct ggml_tensor * qrot = ggml_view_3d(
-ctx0, tmpq, n_rot, n_head, n_tokens,
-ggml_element_size(tmpq) * n_embd_head,
-ggml_element_size(tmpq) * n_embd_head * n_head,
-0
-);
-cb(qrot, "qrot", il);
-
-struct ggml_tensor * krot = ggml_view_3d(
-ctx0, tmpk, n_rot, n_head, n_tokens,
-ggml_element_size(tmpk) * n_embd_head,
-ggml_element_size(tmpk) * n_embd_head * n_head,
-0
-);
-cb(krot, "krot", il);
-
-// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-struct ggml_tensor * qpass = ggml_view_3d(
-ctx0, tmpq, n_rot, n_head, n_tokens,
-ggml_element_size(tmpq) * n_embd_head,
-ggml_element_size(tmpq) * n_embd_head * n_head,
-ggml_element_size(tmpq) * n_rot
-);
-cb(qpass, "qpass", il);
-
-struct ggml_tensor * kpass = ggml_view_3d(
-ctx0, tmpk, n_rot, n_head, n_tokens,
-ggml_element_size(tmpk) * n_embd_head,
-ggml_element_size(tmpk) * n_embd_head * n_head,
-ggml_element_size(tmpk) * n_rot
-);
-cb(kpass, "kpass", il);
-
-struct ggml_tensor * qrotated = ggml_rope_custom(
-ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-);
-cb(qrotated, "qrotated", il);
-
-struct ggml_tensor * krotated = ggml_rope_custom(
-ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
-freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-);
-cb(krotated, "krotated", il);
-
-// ggml currently only supports concatenation on dim=2
-// so we need to permute qrot, qpass, concat, then permute back.
-qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-cb(qrotated, "qrotated", il);
-
-krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-cb(krotated, "krotated", il);
-
-qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-cb(qpass, "qpass", il);
-
-kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-cb(kpass, "kpass", il);
-
-struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
-cb(Qcur, "Qcur", il);
-
-struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-cb(Kcur, "Kcur", il);
-
-struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-cb(Q, "Q", il);
-
-Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
-cb(Kcur, "Kcur", il);
-
-struct ggml_tensor * Vcur = ggml_view_3d(
-ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
-ggml_element_size(tmpqkv_perm) * n_embd_head,
-ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
-ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
-);
-cb(Vcur, "Vcur", il);
-
-cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-model.layers[il].wo, model.layers[il].bo,
-Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-}
-
-if (il == n_layer - 1) {
-// skip computing output for unused tokens
-struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-residual = ggml_get_rows(ctx0, residual, inp_out_ids);
-}
-
-struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur);
-cb(ffn_inp, "ffn_inp", il);
-
-// feed-forward network
-{
-cur = llm_build_norm(ctx0, ffn_inp, hparams,
-model.layers[il].ffn_norm,
-model.layers[il].ffn_norm_b,
-LLM_NORM, cb, il);
-cb(cur, "ffn_norm", il);
-
-cur = llm_build_ffn(ctx0, cur,
-model.layers[il].ffn_up, model.layers[il].ffn_up_b,
-NULL, NULL,
-model.layers[il].ffn_down, model.layers[il].ffn_down_b,
-NULL,
-LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-cb(cur, "ffn_out", il);
-}
-
-cur = ggml_add(ctx0, cur, ffn_inp);
-cb(cur, "l_out", il);
-
-inpL = cur;
-}
-
-cur = inpL;
-
-cur = llm_build_norm(ctx0, cur, hparams,
-model.output_norm,
-model.output_norm_b,
-LLM_NORM, cb, -1);
-cb(cur, "result_norm", -1);
-
-cur = ggml_mul_mat(ctx0, model.output, cur);
-cb(cur, "result_output", -1);
-
-ggml_build_forward_expand(gf, cur);
-
-return gf;
-}
-
-struct ggml_cgraph * build_refact() {
-struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
-
-const int64_t n_embd_head = hparams.n_embd_head_v;
-GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-
-struct ggml_tensor * cur;
-struct ggml_tensor * inpL;
-
-inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
-
-// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-// positions of the tokens in the KV cache
-struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
-
-for (int il = 0; il < n_layer; ++il) {
-struct ggml_tensor * inpSA = inpL;
-
-cur = llm_build_norm(ctx0, inpL, hparams,
-model.layers[il].attn_norm, NULL,
-LLM_NORM_RMS, cb, il);
-cb(cur, "attn_norm", il);
-
-// self-attention
-{
-struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-cb(Qcur, "Qcur", il);
-
-struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-cb(Kcur, "Kcur", il);
-
-struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-cb(Vcur, "Vcur", il);
-
-Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-cb(Kcur, "Kcur", il);
-
-Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-cb(Qcur, "Qcur", il);
-
-cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
-model.layers[il].wo, NULL,
-Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-}
+cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+model.layers[il].wo, NULL,
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8130
|
+
}
|
|
8111
8131
|
|
|
8112
8132
|
if (il == n_layer - 1) {
|
|
8113
8133
|
// skip computing output for unused tokens
|
|
@@ -8168,8 +8188,11 @@ struct llm_build_context {
|
|
|
8168
8188
|
|
|
8169
8189
|
struct ggml_tensor * cur;
|
|
8170
8190
|
struct ggml_tensor * inpL;
|
|
8191
|
+
struct ggml_tensor * inp_pos = nullptr;
|
|
8171
8192
|
|
|
8172
|
-
|
|
8193
|
+
if (model.arch != LLM_ARCH_JINA_BERT_V2) {
|
|
8194
|
+
inp_pos = build_inp_pos();
|
|
8195
|
+
}
|
|
8173
8196
|
struct ggml_tensor * inp_mean = build_inp_mean();
|
|
8174
8197
|
struct ggml_tensor * inp_cls = build_inp_cls();
|
|
8175
8198
|
|
|
@@ -8200,13 +8223,26 @@ struct llm_build_context {
|
|
|
8200
8223
|
struct ggml_tensor * Vcur;
|
|
8201
8224
|
|
|
8202
8225
|
// self-attention
|
|
8203
|
-
if (model.arch == LLM_ARCH_BERT) {
|
|
8226
|
+
if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
8204
8227
|
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
|
|
8205
8228
|
cb(Qcur, "Qcur", il);
|
|
8206
8229
|
|
|
8230
|
+
if (model.layers[il].attn_q_norm) {
|
|
8231
|
+
Qcur = llm_build_norm(ctx0, Qcur, hparams,
|
|
8232
|
+
model.layers[il].attn_q_norm,
|
|
8233
|
+
model.layers[il].attn_q_norm_b,
|
|
8234
|
+
LLM_NORM, cb, il);
|
|
8235
|
+
}
|
|
8236
|
+
|
|
8207
8237
|
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
|
|
8208
8238
|
cb(Kcur, "Kcur", il);
|
|
8209
8239
|
|
|
8240
|
+
if (model.layers[il].attn_k_norm) {
|
|
8241
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
|
8242
|
+
model.layers[il].attn_k_norm,
|
|
8243
|
+
model.layers[il].attn_k_norm_b,
|
|
8244
|
+
LLM_NORM, cb, il);
|
|
8245
|
+
}
|
|
8210
8246
|
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
|
|
8211
8247
|
cb(Vcur, "Vcur", il);
|
|
8212
8248
|
|
|
@@ -8225,15 +8261,15 @@ struct llm_build_context {
|
|
|
8225
8261
|
cb(Kcur, "Kcur", il);
|
|
8226
8262
|
cb(Vcur, "Vcur", il);
|
|
8227
8263
|
|
|
8228
|
-
Qcur =
|
|
8229
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
8264
|
+
Qcur = ggml_rope_ext(
|
|
8265
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8230
8266
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8231
8267
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8232
8268
|
);
|
|
8233
8269
|
cb(Qcur, "Qcur", il);
|
|
8234
8270
|
|
|
8235
|
-
Kcur =
|
|
8236
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
8271
|
+
Kcur = ggml_rope_ext(
|
|
8272
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8237
8273
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8238
8274
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8239
8275
|
);
|
|
@@ -8246,7 +8282,7 @@ struct llm_build_context {
|
|
|
8246
8282
|
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
8247
8283
|
cb(kq, "kq", il);
|
|
8248
8284
|
|
|
8249
|
-
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask,
|
|
8285
|
+
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
|
8250
8286
|
cb(kq, "kq_soft_max_ext", il);
|
|
8251
8287
|
|
|
8252
8288
|
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
|
@@ -8297,6 +8333,13 @@ struct llm_build_context {
|
|
|
8297
8333
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
8298
8334
|
NULL,
|
|
8299
8335
|
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
8336
|
+
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
8337
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
8338
|
+
model.layers[il].ffn_up, NULL,
|
|
8339
|
+
model.layers[il].ffn_gate, NULL,
|
|
8340
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
8341
|
+
NULL,
|
|
8342
|
+
LLM_FFN_GELU, LLM_FFN_PAR, cb, il);
|
|
8300
8343
|
} else {
|
|
8301
8344
|
cur = llm_build_ffn(ctx0, cur,
|
|
8302
8345
|
model.layers[il].ffn_up, NULL,
|
|
@@ -8363,9 +8406,6 @@ struct llm_build_context {
|
|
|
8363
8406
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
8364
8407
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
8365
8408
|
|
|
8366
|
-
// positions of the tokens in the KV cache
|
|
8367
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
8368
|
-
|
|
8369
8409
|
inpL = llm_build_norm(ctx0, inpL, hparams,
|
|
8370
8410
|
model.tok_norm,
|
|
8371
8411
|
model.tok_norm_b,
|
|
@@ -8399,7 +8439,7 @@ struct llm_build_context {
|
|
|
8399
8439
|
|
|
8400
8440
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8401
8441
|
model.layers[il].wo, model.layers[il].bo,
|
|
8402
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8442
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8403
8443
|
}
|
|
8404
8444
|
|
|
8405
8445
|
if (il == n_layer - 1) {
|
|
@@ -8464,9 +8504,6 @@ struct llm_build_context {
|
|
|
8464
8504
|
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
8465
8505
|
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
8466
8506
|
|
|
8467
|
-
// positions of the tokens in the KV cache
|
|
8468
|
-
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
|
|
8469
|
-
|
|
8470
8507
|
if (model.pos_embd) {
|
|
8471
8508
|
// inp_pos - contains the positions
|
|
8472
8509
|
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
@@ -8530,13 +8567,13 @@ struct llm_build_context {
|
|
|
8530
8567
|
|
|
8531
8568
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8532
8569
|
model.layers[il].wo, model.layers[il].bo,
|
|
8533
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8570
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8534
8571
|
} else {
|
|
8535
8572
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
8536
8573
|
|
|
8537
8574
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8538
8575
|
model.layers[il].wo, model.layers[il].bo,
|
|
8539
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8576
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8540
8577
|
}
|
|
8541
8578
|
}
|
|
8542
8579
|
|
|
@@ -8664,15 +8701,15 @@ struct llm_build_context {
|
|
|
8664
8701
|
}
|
|
8665
8702
|
|
|
8666
8703
|
|
|
8667
|
-
Qcur =
|
|
8668
|
-
ctx0, Qcur, inp_pos,
|
|
8704
|
+
Qcur = ggml_rope_ext(
|
|
8705
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
8669
8706
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8670
8707
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8671
8708
|
);
|
|
8672
8709
|
cb(Qcur, "Qcur", il);
|
|
8673
8710
|
|
|
8674
|
-
Kcur =
|
|
8675
|
-
ctx0, Kcur, inp_pos,
|
|
8711
|
+
Kcur = ggml_rope_ext(
|
|
8712
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
8676
8713
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8677
8714
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8678
8715
|
);
|
|
@@ -8680,7 +8717,7 @@ struct llm_build_context {
|
|
|
8680
8717
|
|
|
8681
8718
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8682
8719
|
model.layers[il].wo, NULL,
|
|
8683
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8720
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8684
8721
|
}
|
|
8685
8722
|
|
|
8686
8723
|
if (il == n_layer - 1) {
|
|
@@ -8784,21 +8821,21 @@ struct llm_build_context {
|
|
|
8784
8821
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
8785
8822
|
|
|
8786
8823
|
// using mode = 2 for neox mode
|
|
8787
|
-
Qcur =
|
|
8788
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
8824
|
+
Qcur = ggml_rope_ext(
|
|
8825
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
|
8789
8826
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8790
8827
|
);
|
|
8791
8828
|
cb(Qcur, "Qcur", il);
|
|
8792
8829
|
|
|
8793
|
-
Kcur =
|
|
8794
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
8830
|
+
Kcur = ggml_rope_ext(
|
|
8831
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
|
8795
8832
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
8796
8833
|
);
|
|
8797
8834
|
cb(Kcur, "Kcur", il);
|
|
8798
8835
|
|
|
8799
8836
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8800
8837
|
model.layers[il].wo, NULL,
|
|
8801
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8838
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8802
8839
|
}
|
|
8803
8840
|
|
|
8804
8841
|
if (il == n_layer - 1) {
|
|
@@ -8895,15 +8932,15 @@ struct llm_build_context {
|
|
|
8895
8932
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
8896
8933
|
cb(Vcur, "Vcur", il);
|
|
8897
8934
|
|
|
8898
|
-
Qcur =
|
|
8899
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
8935
|
+
Qcur = ggml_rope_ext(
|
|
8936
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
8900
8937
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8901
8938
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8902
8939
|
);
|
|
8903
8940
|
cb(Qcur, "Qcur", il);
|
|
8904
8941
|
|
|
8905
|
-
Kcur =
|
|
8906
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
8942
|
+
Kcur = ggml_rope_ext(
|
|
8943
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
8907
8944
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
8908
8945
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
8909
8946
|
);
|
|
@@ -8911,7 +8948,7 @@ struct llm_build_context {
|
|
|
8911
8948
|
|
|
8912
8949
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
8913
8950
|
model.layers[il].wo, model.layers[il].bo,
|
|
8914
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
8951
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
8915
8952
|
}
|
|
8916
8953
|
|
|
8917
8954
|
if (il == n_layer - 1) {
|
|
@@ -9009,15 +9046,15 @@ struct llm_build_context {
|
|
|
9009
9046
|
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
9010
9047
|
cb(Vcur, "Vcur", il);
|
|
9011
9048
|
|
|
9012
|
-
Qcur =
|
|
9013
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
9049
|
+
Qcur = ggml_rope_ext(
|
|
9050
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9014
9051
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9015
9052
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9016
9053
|
);
|
|
9017
9054
|
cb(Qcur, "Qcur", il);
|
|
9018
9055
|
|
|
9019
|
-
Kcur =
|
|
9020
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
9056
|
+
Kcur = ggml_rope_ext(
|
|
9057
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9021
9058
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9022
9059
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9023
9060
|
);
|
|
@@ -9025,7 +9062,7 @@ struct llm_build_context {
|
|
|
9025
9062
|
|
|
9026
9063
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9027
9064
|
model.layers[il].wo, model.layers[il].bo,
|
|
9028
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9065
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9029
9066
|
}
|
|
9030
9067
|
|
|
9031
9068
|
if (il == n_layer - 1) {
|
|
@@ -9161,8 +9198,8 @@ struct llm_build_context {
|
|
|
9161
9198
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9162
9199
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9163
9200
|
|
|
9164
|
-
Qcur =
|
|
9165
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
9201
|
+
Qcur = ggml_rope_ext(
|
|
9202
|
+
ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
|
9166
9203
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9167
9204
|
);
|
|
9168
9205
|
cb(Qcur, "Qcur", il);
|
|
@@ -9172,15 +9209,15 @@ struct llm_build_context {
|
|
|
9172
9209
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
|
9173
9210
|
cb(Qcur, "Qcur", il);
|
|
9174
9211
|
|
|
9175
|
-
Kcur =
|
|
9176
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
9212
|
+
Kcur = ggml_rope_ext(
|
|
9213
|
+
ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx,
|
|
9177
9214
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9178
9215
|
);
|
|
9179
9216
|
cb(Kcur, "Kcur", il);
|
|
9180
9217
|
|
|
9181
9218
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9182
9219
|
model.layers[il].wo, model.layers[il].bo,
|
|
9183
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9220
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
9184
9221
|
}
|
|
9185
9222
|
|
|
9186
9223
|
if (il == n_layer - 1) {
|
|
@@ -9249,6 +9286,9 @@ struct llm_build_context {
|
|
|
9249
9286
|
|
|
9250
9287
|
// self-attention
|
|
9251
9288
|
{
|
|
9289
|
+
// rope freq factors for 128k context
|
|
9290
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
|
9291
|
+
|
|
9252
9292
|
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
|
9253
9293
|
model.layers[il].attn_norm,
|
|
9254
9294
|
NULL,
|
|
@@ -9280,8 +9320,8 @@ struct llm_build_context {
|
|
|
9280
9320
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
9281
9321
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
9282
9322
|
|
|
9283
|
-
Qcur =
|
|
9284
|
-
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
9323
|
+
Qcur = ggml_rope_ext(
|
|
9324
|
+
ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
|
9285
9325
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9286
9326
|
);
|
|
9287
9327
|
cb(Qcur, "Qcur", il);
|
|
@@ -9289,15 +9329,15 @@ struct llm_build_context {
|
|
|
9289
9329
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
|
9290
9330
|
cb(Qcur, "Qcur", il);
|
|
9291
9331
|
|
|
9292
|
-
Kcur =
|
|
9293
|
-
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
|
9332
|
+
Kcur = ggml_rope_ext(
|
|
9333
|
+
ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx,
|
|
9294
9334
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
9295
9335
|
);
|
|
9296
9336
|
cb(Kcur, "Kcur", il);
|
|
9297
9337
|
|
|
9298
9338
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9299
9339
|
model.layers[il].wo, model.layers[il].bo,
|
|
9300
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9340
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
9301
9341
|
}
|
|
9302
9342
|
|
|
9303
9343
|
if (il == n_layer - 1) {
|
|
@@ -9396,21 +9436,21 @@ struct llm_build_context {
|
|
|
9396
9436
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
9397
9437
|
cb(Vcur, "Vcur", il);
|
|
9398
9438
|
|
|
9399
|
-
Qcur =
|
|
9400
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
|
9439
|
+
Qcur = ggml_rope_ext(
|
|
9440
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr,
|
|
9401
9441
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9402
9442
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9403
9443
|
cb(Qcur, "Qcur", il);
|
|
9404
9444
|
|
|
9405
|
-
Kcur =
|
|
9406
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
|
9445
|
+
Kcur = ggml_rope_ext(
|
|
9446
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9407
9447
|
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9408
9448
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
9409
9449
|
cb(Kcur, "Kcur", il);
|
|
9410
9450
|
|
|
9411
9451
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9412
9452
|
model.layers[il].wo, NULL,
|
|
9413
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9453
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9414
9454
|
}
|
|
9415
9455
|
struct ggml_tensor * sa_out = cur;
|
|
9416
9456
|
|
|
@@ -9513,7 +9553,7 @@ struct llm_build_context {
|
|
|
9513
9553
|
|
|
9514
9554
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9515
9555
|
model.layers[il].wo, model.layers[il].bo,
|
|
9516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9556
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9517
9557
|
}
|
|
9518
9558
|
|
|
9519
9559
|
if (il == n_layer - 1) {
|
|
@@ -9604,15 +9644,15 @@ struct llm_build_context {
|
|
|
9604
9644
|
cb(tmpk, "tmpk", il);
|
|
9605
9645
|
cb(Vcur, "Vcur", il);
|
|
9606
9646
|
|
|
9607
|
-
struct ggml_tensor * Qcur =
|
|
9608
|
-
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
|
9647
|
+
struct ggml_tensor * Qcur = ggml_rope_ext(
|
|
9648
|
+
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9609
9649
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9610
9650
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9611
9651
|
);
|
|
9612
9652
|
cb(Qcur, "Qcur", il);
|
|
9613
9653
|
|
|
9614
|
-
struct ggml_tensor * Kcur =
|
|
9615
|
-
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
9654
|
+
struct ggml_tensor * Kcur = ggml_rope_ext(
|
|
9655
|
+
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9616
9656
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9617
9657
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9618
9658
|
);
|
|
@@ -9620,7 +9660,7 @@ struct llm_build_context {
|
|
|
9620
9660
|
|
|
9621
9661
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9622
9662
|
model.layers[il].wo, model.layers[il].bo,
|
|
9623
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9663
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9624
9664
|
}
|
|
9625
9665
|
|
|
9626
9666
|
if (il == n_layer - 1) {
|
|
@@ -9720,15 +9760,15 @@ struct llm_build_context {
|
|
|
9720
9760
|
// cb(Vcur, "Vcur", il);
|
|
9721
9761
|
// }
|
|
9722
9762
|
|
|
9723
|
-
Qcur =
|
|
9724
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
9763
|
+
Qcur = ggml_rope_ext(
|
|
9764
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9725
9765
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9726
9766
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9727
9767
|
);
|
|
9728
9768
|
cb(Qcur, "Qcur", il);
|
|
9729
9769
|
|
|
9730
|
-
Kcur =
|
|
9731
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
9770
|
+
Kcur = ggml_rope_ext(
|
|
9771
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9732
9772
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9733
9773
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9734
9774
|
);
|
|
@@ -9736,7 +9776,7 @@ struct llm_build_context {
|
|
|
9736
9776
|
|
|
9737
9777
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9738
9778
|
model.layers[il].wo, NULL,
|
|
9739
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9779
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9740
9780
|
}
|
|
9741
9781
|
|
|
9742
9782
|
if (il == n_layer - 1) {
|
|
@@ -9837,15 +9877,15 @@ struct llm_build_context {
|
|
|
9837
9877
|
cb(Vcur, "Vcur", il);
|
|
9838
9878
|
}
|
|
9839
9879
|
|
|
9840
|
-
Qcur =
|
|
9841
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
9880
|
+
Qcur = ggml_rope_ext(
|
|
9881
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9842
9882
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9843
9883
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9844
9884
|
);
|
|
9845
9885
|
cb(Qcur, "Qcur", il);
|
|
9846
9886
|
|
|
9847
|
-
Kcur =
|
|
9848
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
9887
|
+
Kcur = ggml_rope_ext(
|
|
9888
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9849
9889
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9850
9890
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9851
9891
|
);
|
|
@@ -9853,7 +9893,7 @@ struct llm_build_context {
|
|
|
9853
9893
|
|
|
9854
9894
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9855
9895
|
model.layers[il].wo, model.layers[il].bo,
|
|
9856
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
9896
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9857
9897
|
}
|
|
9858
9898
|
|
|
9859
9899
|
if (il == n_layer - 1) {
|
|
@@ -9967,15 +10007,15 @@ struct llm_build_context {
|
|
|
9967
10007
|
cb(Vcur, "Vcur", il);
|
|
9968
10008
|
}
|
|
9969
10009
|
|
|
9970
|
-
Qcur =
|
|
9971
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
10010
|
+
Qcur = ggml_rope_ext(
|
|
10011
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
9972
10012
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9973
10013
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9974
10014
|
);
|
|
9975
10015
|
cb(Qcur, "Qcur", il);
|
|
9976
10016
|
|
|
9977
|
-
Kcur =
|
|
9978
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
10017
|
+
Kcur = ggml_rope_ext(
|
|
10018
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
9979
10019
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
9980
10020
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
9981
10021
|
);
|
|
@@ -9983,7 +10023,7 @@ struct llm_build_context {
|
|
|
9983
10023
|
|
|
9984
10024
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
9985
10025
|
model.layers[il].wo, model.layers[il].bo,
|
|
9986
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10026
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
9987
10027
|
}
|
|
9988
10028
|
|
|
9989
10029
|
if (il == n_layer - 1) {
|
|
@@ -10087,8 +10127,8 @@ struct llm_build_context {
|
|
|
10087
10127
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
10088
10128
|
cb(Vcur, "Vcur", il);
|
|
10089
10129
|
|
|
10090
|
-
Qcur =
|
|
10091
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
|
10130
|
+
Qcur = ggml_rope_ext(
|
|
10131
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr,
|
|
10092
10132
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10093
10133
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10094
10134
|
cb(Qcur, "Qcur", il);
|
|
@@ -10096,15 +10136,15 @@ struct llm_build_context {
|
|
|
10096
10136
|
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head_k)));
|
|
10097
10137
|
cb(Qcur, "Qcur_scaled", il);
|
|
10098
10138
|
|
|
10099
|
-
Kcur =
|
|
10100
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
|
10139
|
+
Kcur = ggml_rope_ext(
|
|
10140
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10101
10141
|
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10102
10142
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
10103
10143
|
cb(Kcur, "Kcur", il);
|
|
10104
10144
|
|
|
10105
10145
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10106
10146
|
model.layers[il].wo, NULL,
|
|
10107
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10147
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
|
|
10108
10148
|
}
|
|
10109
10149
|
|
|
10110
10150
|
if (il == n_layer - 1) {
|
|
@@ -10207,15 +10247,15 @@ struct llm_build_context {
|
|
|
10207
10247
|
cb(Vcur, "Vcur", il);
|
|
10208
10248
|
}
|
|
10209
10249
|
|
|
10210
|
-
Qcur =
|
|
10211
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
10250
|
+
Qcur = ggml_rope_ext(
|
|
10251
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10212
10252
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10213
10253
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10214
10254
|
);
|
|
10215
10255
|
cb(Qcur, "Qcur", il);
|
|
10216
10256
|
|
|
10217
|
-
Kcur =
|
|
10218
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
10257
|
+
Kcur = ggml_rope_ext(
|
|
10258
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10219
10259
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10220
10260
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10221
10261
|
);
|
|
@@ -10223,7 +10263,7 @@ struct llm_build_context {
|
|
|
10223
10263
|
|
|
10224
10264
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10225
10265
|
model.layers[il].wo, model.layers[il].bo,
|
|
10226
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10266
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10227
10267
|
}
|
|
10228
10268
|
|
|
10229
10269
|
if (il == n_layer - 1) {
|
|
@@ -10490,22 +10530,267 @@ struct llm_build_context {
|
|
|
10490
10530
|
LLM_NORM, cb, il);
|
|
10491
10531
|
cb(Qcur, "Qcur", il);
|
|
10492
10532
|
|
|
10493
|
-
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
|
10494
|
-
model.layers[il].attn_k_norm,
|
|
10495
|
-
NULL,
|
|
10496
|
-
LLM_NORM, cb, il);
|
|
10497
|
-
cb(Kcur, "Kcur", il);
|
|
10498
|
-
}
|
|
10533
|
+
Kcur = llm_build_norm(ctx0, Kcur, hparams,
|
|
10534
|
+
model.layers[il].attn_k_norm,
|
|
10535
|
+
NULL,
|
|
10536
|
+
LLM_NORM, cb, il);
|
|
10537
|
+
cb(Kcur, "Kcur", il);
|
|
10538
|
+
}
|
|
10539
|
+
|
|
10540
|
+
Qcur = ggml_rope_ext(
|
|
10541
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10542
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10543
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10544
|
+
);
|
|
10545
|
+
cb(Qcur, "Qcur", il);
|
|
10546
|
+
|
|
10547
|
+
Kcur = ggml_rope_ext(
|
|
10548
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10549
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10550
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10551
|
+
);
|
|
10552
|
+
cb(Kcur, "Kcur", il);
|
|
10553
|
+
|
|
10554
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10555
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
10556
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10557
|
+
}
|
|
10558
|
+
|
|
10559
|
+
if (il == n_layer - 1) {
|
|
10560
|
+
// skip computing output for unused tokens
|
|
10561
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10562
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10563
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10564
|
+
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
10565
|
+
}
|
|
10566
|
+
|
|
10567
|
+
struct ggml_tensor * attn_out = cur;
|
|
10568
|
+
|
|
10569
|
+
// feed-forward network
|
|
10570
|
+
{
|
|
10571
|
+
cur = llm_build_ffn(ctx0, ffn_inp,
|
|
10572
|
+
model.layers[il].ffn_up, NULL,
|
|
10573
|
+
model.layers[il].ffn_gate, NULL,
|
|
10574
|
+
model.layers[il].ffn_down, NULL,
|
|
10575
|
+
NULL,
|
|
10576
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
10577
|
+
cb(cur, "ffn_out", il);
|
|
10578
|
+
}
|
|
10579
|
+
|
|
10580
|
+
// add together residual + FFN + self-attention
|
|
10581
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
10582
|
+
cur = ggml_add(ctx0, cur, attn_out);
|
|
10583
|
+
cb(cur, "l_out", il);
|
|
10584
|
+
|
|
10585
|
+
// input for next layer
|
|
10586
|
+
inpL = cur;
|
|
10587
|
+
}
|
|
10588
|
+
|
|
10589
|
+
cur = inpL;
|
|
10590
|
+
|
|
10591
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
10592
|
+
model.output_norm, NULL,
|
|
10593
|
+
LLM_NORM, cb, -1);
|
|
10594
|
+
cb(cur, "result_norm", -1);
|
|
10595
|
+
|
|
10596
|
+
// lm_head
|
|
10597
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10598
|
+
|
|
10599
|
+
if (f_logit_scale) {
|
|
10600
|
+
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
|
10601
|
+
}
|
|
10602
|
+
|
|
10603
|
+
cb(cur, "result_output", -1);
|
|
10604
|
+
|
|
10605
|
+
ggml_build_forward_expand(gf, cur);
|
|
10606
|
+
|
|
10607
|
+
return gf;
|
|
10608
|
+
|
|
10609
|
+
}
|
|
10610
|
+
|
|
10611
|
+
// ref: https://allenai.org/olmo
|
|
10612
|
+
// based on the original build_llama() function, changes:
|
|
10613
|
+
// * non-parametric layer norm
|
|
10614
|
+
// * clamp qkv
|
|
10615
|
+
// * removed bias
|
|
10616
|
+
// * removed MoE
|
|
10617
|
+
struct ggml_cgraph * build_olmo() {
|
|
10618
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
10619
|
+
|
|
10620
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
10621
|
+
int32_t n_tokens = this->n_tokens;
|
|
10622
|
+
|
|
10623
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10624
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10625
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
10626
|
+
|
|
10627
|
+
struct ggml_tensor * cur;
|
|
10628
|
+
struct ggml_tensor * inpL;
|
|
10629
|
+
|
|
10630
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
10631
|
+
|
|
10632
|
+
// inp_pos - contains the positions
|
|
10633
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
10634
|
+
|
|
10635
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
10636
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
10637
|
+
|
|
10638
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10639
|
+
struct ggml_tensor * inpSA = inpL;
|
|
10640
|
+
|
|
10641
|
+
// norm
|
|
10642
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10643
|
+
NULL, NULL,
|
|
10644
|
+
LLM_NORM, cb, il);
|
|
10645
|
+
cb(cur, "attn_norm", il);
|
|
10646
|
+
|
|
10647
|
+
// self-attention
|
|
10648
|
+
{
|
|
10649
|
+
// compute Q and K and RoPE them
|
|
10650
|
+
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
10651
|
+
cb(Qcur, "Qcur", il);
|
|
10652
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10653
|
+
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10654
|
+
cb(Qcur, "Qcur", il);
|
|
10655
|
+
}
|
|
10656
|
+
|
|
10657
|
+
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
|
10658
|
+
cb(Kcur, "Kcur", il);
|
|
10659
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10660
|
+
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10661
|
+
cb(Kcur, "Kcur", il);
|
|
10662
|
+
}
|
|
10663
|
+
|
|
10664
|
+
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
10665
|
+
cb(Vcur, "Vcur", il);
|
|
10666
|
+
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10667
|
+
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10668
|
+
cb(Vcur, "Vcur", il);
|
|
10669
|
+
}
|
|
10670
|
+
|
|
10671
|
+
Qcur = ggml_rope_ext(
|
|
10672
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10673
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10674
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10675
|
+
);
|
|
10676
|
+
cb(Qcur, "Qcur", il);
|
|
10677
|
+
|
|
10678
|
+
Kcur = ggml_rope_ext(
|
|
10679
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10680
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10681
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10682
|
+
);
|
|
10683
|
+
cb(Kcur, "Kcur", il);
|
|
10684
|
+
|
|
10685
|
+
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10686
|
+
model.layers[il].wo, nullptr,
|
|
10687
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10688
|
+
}
|
|
10689
|
+
|
|
10690
|
+
if (il == n_layer - 1) {
|
|
10691
|
+
// skip computing output for unused tokens
|
|
10692
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10693
|
+
n_tokens = n_outputs;
|
|
10694
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10695
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
10696
|
+
}
|
|
10697
|
+
|
|
10698
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
10699
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
10700
|
+
|
|
10701
|
+
// feed-forward network
|
|
10702
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
10703
|
+
NULL, NULL,
|
|
10704
|
+
LLM_NORM, cb, il);
|
|
10705
|
+
cb(cur, "ffn_norm", il);
|
|
10706
|
+
|
|
10707
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
10708
|
+
model.layers[il].ffn_up, NULL,
|
|
10709
|
+
model.layers[il].ffn_gate, NULL,
|
|
10710
|
+
model.layers[il].ffn_down, NULL,
|
|
10711
|
+
NULL,
|
|
10712
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
10713
|
+
cb(cur, "ffn_out", il);
|
|
10714
|
+
|
|
10715
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
10716
|
+
cb(cur, "ffn_out", il);
|
|
10717
|
+
|
|
10718
|
+
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
|
10719
|
+
if (layer_dir != nullptr) {
|
|
10720
|
+
cur = ggml_add(ctx0, cur, layer_dir);
|
|
10721
|
+
}
|
|
10722
|
+
cb(cur, "l_out", il);
|
|
10723
|
+
|
|
10724
|
+
// input for next layer
|
|
10725
|
+
inpL = cur;
|
|
10726
|
+
}
|
|
10727
|
+
|
|
10728
|
+
cur = inpL;
|
|
10729
|
+
|
|
10730
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
10731
|
+
NULL, NULL,
|
|
10732
|
+
LLM_NORM, cb, -1);
|
|
10733
|
+
cb(cur, "result_norm", -1);
|
|
10734
|
+
|
|
10735
|
+
// lm_head
|
|
10736
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10737
|
+
cb(cur, "result_output", -1);
|
|
10738
|
+
|
|
10739
|
+
ggml_build_forward_expand(gf, cur);
|
|
10740
|
+
|
|
10741
|
+
return gf;
|
|
10742
|
+
}
|
|
10743
|
+
|
|
10744
|
+
struct ggml_cgraph * build_gptneox() {
|
|
10745
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
10746
|
+
|
|
10747
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
10748
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
10749
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
10750
|
+
|
|
10751
|
+
struct ggml_tensor * cur;
|
|
10752
|
+
struct ggml_tensor * inpL;
|
|
10753
|
+
|
|
10754
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
|
10755
|
+
|
|
10756
|
+
// inp_pos - contains the positions
|
|
10757
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
10758
|
+
|
|
10759
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
10760
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
10761
|
+
|
|
10762
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
10763
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10764
|
+
model.layers[il].attn_norm,
|
|
10765
|
+
model.layers[il].attn_norm_b,
|
|
10766
|
+
LLM_NORM, cb, il);
|
|
10767
|
+
cb(cur, "attn_norm", il);
|
|
10768
|
+
|
|
10769
|
+
// self-attention
|
|
10770
|
+
{
|
|
10771
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
|
10772
|
+
cb(cur, "wqkv", il);
|
|
10773
|
+
|
|
10774
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
10775
|
+
cb(cur, "bqkv", il);
|
|
10776
|
+
|
|
10777
|
+
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
10778
|
+
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
10779
|
+
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
10780
|
+
|
|
10781
|
+
cb(Qcur, "Qcur", il);
|
|
10782
|
+
cb(Kcur, "Kcur", il);
|
|
10783
|
+
cb(Vcur, "Vcur", il);
|
|
10499
10784
|
|
|
10500
|
-
Qcur =
|
|
10501
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
10785
|
+
Qcur = ggml_rope_ext(
|
|
10786
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10502
10787
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10503
10788
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10504
10789
|
);
|
|
10505
10790
|
cb(Qcur, "Qcur", il);
|
|
10506
10791
|
|
|
10507
|
-
Kcur =
|
|
10508
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
10792
|
+
Kcur = ggml_rope_ext(
|
|
10793
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10509
10794
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10510
10795
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10511
10796
|
);
|
|
@@ -10513,68 +10798,84 @@ struct llm_build_context {
|
|
|
10513
10798
|
|
|
10514
10799
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10515
10800
|
model.layers[il].wo, model.layers[il].bo,
|
|
10516
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10801
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10517
10802
|
}
|
|
10518
10803
|
|
|
10519
10804
|
if (il == n_layer - 1) {
|
|
10520
10805
|
// skip computing output for unused tokens
|
|
10521
10806
|
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
10522
|
-
cur
|
|
10523
|
-
inpL
|
|
10524
|
-
ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
|
|
10807
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
10808
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
10525
10809
|
}
|
|
10526
10810
|
|
|
10527
|
-
|
|
10811
|
+
// ffn
|
|
10812
|
+
if (hparams.use_par_res) {
|
|
10813
|
+
// attention and ffn are computed in parallel
|
|
10814
|
+
// x = x + attn(ln1(x)) + ffn(ln2(x))
|
|
10528
10815
|
|
|
10529
|
-
|
|
10530
|
-
|
|
10531
|
-
cur =
|
|
10532
|
-
model.layers[il].
|
|
10533
|
-
model.layers[il].
|
|
10534
|
-
|
|
10816
|
+
struct ggml_tensor * attn_out = cur;
|
|
10817
|
+
|
|
10818
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10819
|
+
model.layers[il].ffn_norm,
|
|
10820
|
+
model.layers[il].ffn_norm_b,
|
|
10821
|
+
LLM_NORM, cb, il);
|
|
10822
|
+
cb(cur, "ffn_norm", il);
|
|
10823
|
+
|
|
10824
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
10825
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
10826
|
+
NULL, NULL,
|
|
10827
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
10535
10828
|
NULL,
|
|
10536
|
-
|
|
10829
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
10537
10830
|
cb(cur, "ffn_out", il);
|
|
10538
|
-
}
|
|
10539
10831
|
|
|
10540
|
-
|
|
10541
|
-
|
|
10542
|
-
cur = ggml_add(ctx0, cur, attn_out);
|
|
10543
|
-
cb(cur, "l_out", il);
|
|
10832
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
10833
|
+
cb(cur, "ffn_out", il);
|
|
10544
10834
|
|
|
10545
|
-
|
|
10546
|
-
|
|
10547
|
-
|
|
10835
|
+
inpL = ggml_add(ctx0, cur, attn_out);
|
|
10836
|
+
cb(inpL, "l_out", il);
|
|
10837
|
+
} else {
|
|
10838
|
+
// attention and ffn are computed sequentially
|
|
10839
|
+
// x = x + attn(ln1(x))
|
|
10840
|
+
// x = x + ffn(ln2(x))
|
|
10548
10841
|
|
|
10549
|
-
|
|
10842
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
10843
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
10550
10844
|
|
|
10551
|
-
|
|
10552
|
-
|
|
10553
|
-
|
|
10554
|
-
|
|
10845
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
10846
|
+
model.layers[il].ffn_norm,
|
|
10847
|
+
model.layers[il].ffn_norm_b,
|
|
10848
|
+
LLM_NORM, cb, il);
|
|
10849
|
+
cb(cur, "ffn_norm", il);
|
|
10555
10850
|
|
|
10556
|
-
|
|
10557
|
-
|
|
10851
|
+
cur = llm_build_ffn(ctx0, cur,
|
|
10852
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
|
10853
|
+
NULL, NULL,
|
|
10854
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
|
10855
|
+
NULL,
|
|
10856
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
10857
|
+
cb(cur, "ffn_out", il);
|
|
10558
10858
|
|
|
10559
|
-
|
|
10560
|
-
|
|
10859
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
|
10860
|
+
cb(inpL, "l_out", il);
|
|
10861
|
+
}
|
|
10561
10862
|
}
|
|
10562
10863
|
|
|
10864
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10865
|
+
model.output_norm,
|
|
10866
|
+
model.output_norm_b,
|
|
10867
|
+
LLM_NORM, cb, -1);
|
|
10868
|
+
cb(cur, "result_norm", -1);
|
|
10869
|
+
|
|
10870
|
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
|
10563
10871
|
cb(cur, "result_output", -1);
|
|
10564
10872
|
|
|
10565
10873
|
ggml_build_forward_expand(gf, cur);
|
|
10566
10874
|
|
|
10567
10875
|
return gf;
|
|
10568
|
-
|
|
10569
10876
|
}
|
|
10570
10877
|
|
|
10571
|
-
|
|
10572
|
-
// based on the original build_llama() function, changes:
|
|
10573
|
-
// * non-parametric layer norm
|
|
10574
|
-
// * clamp qkv
|
|
10575
|
-
// * removed bias
|
|
10576
|
-
// * removed MoE
|
|
10577
|
-
struct ggml_cgraph * build_olmo() {
|
|
10878
|
+
struct ggml_cgraph * build_arctic() {
|
|
10578
10879
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
10579
10880
|
|
|
10580
10881
|
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
@@ -10600,8 +10901,8 @@ struct llm_build_context {
|
|
|
10600
10901
|
|
|
10601
10902
|
// norm
|
|
10602
10903
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
10603
|
-
|
|
10604
|
-
|
|
10904
|
+
model.layers[il].attn_norm, NULL,
|
|
10905
|
+
LLM_NORM_RMS, cb, il);
|
|
10605
10906
|
cb(cur, "attn_norm", il);
|
|
10606
10907
|
|
|
10607
10908
|
// self-attention
|
|
@@ -10609,42 +10910,30 @@ struct llm_build_context {
|
|
|
10609
10910
|
// compute Q and K and RoPE them
|
|
10610
10911
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
10611
10912
|
cb(Qcur, "Qcur", il);
|
|
10612
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10613
|
-
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10614
|
-
cb(Qcur, "Qcur", il);
|
|
10615
|
-
}
|
|
10616
10913
|
|
|
10617
10914
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
|
10618
10915
|
cb(Kcur, "Kcur", il);
|
|
10619
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10620
|
-
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10621
|
-
cb(Kcur, "Kcur", il);
|
|
10622
|
-
}
|
|
10623
10916
|
|
|
10624
10917
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
|
10625
10918
|
cb(Vcur, "Vcur", il);
|
|
10626
|
-
if (hparams.f_clamp_kqv > 0.0f) {
|
|
10627
|
-
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
|
10628
|
-
cb(Vcur, "Vcur", il);
|
|
10629
|
-
}
|
|
10630
10919
|
|
|
10631
|
-
Qcur =
|
|
10632
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
10920
|
+
Qcur = ggml_rope_ext(
|
|
10921
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
|
|
10633
10922
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10634
10923
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10635
10924
|
);
|
|
10636
10925
|
cb(Qcur, "Qcur", il);
|
|
10637
10926
|
|
|
10638
|
-
Kcur =
|
|
10639
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
10927
|
+
Kcur = ggml_rope_ext(
|
|
10928
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
|
|
10640
10929
|
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
|
10641
10930
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10642
10931
|
);
|
|
10643
10932
|
cb(Kcur, "Kcur", il);
|
|
10644
10933
|
|
|
10645
10934
|
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
|
|
10646
|
-
model.layers[il].wo,
|
|
10647
|
-
Kcur, Vcur, Qcur, KQ_mask,
|
|
10935
|
+
model.layers[il].wo, NULL,
|
|
10936
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
|
10648
10937
|
}
|
|
10649
10938
|
|
|
10650
10939
|
if (il == n_layer - 1) {
|
|
@@ -10660,8 +10949,8 @@ struct llm_build_context {
|
|
|
10660
10949
|
|
|
10661
10950
|
// feed-forward network
|
|
10662
10951
|
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
10663
|
-
|
|
10664
|
-
|
|
10952
|
+
model.layers[il].ffn_norm, NULL,
|
|
10953
|
+
LLM_NORM_RMS, cb, il);
|
|
10665
10954
|
cb(cur, "ffn_norm", il);
|
|
10666
10955
|
|
|
10667
10956
|
cur = llm_build_ffn(ctx0, cur,
|
|
@@ -10672,7 +10961,26 @@ struct llm_build_context {
|
|
|
10672
10961
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
10673
10962
|
cb(cur, "ffn_out", il);
|
|
10674
10963
|
|
|
10675
|
-
|
|
10964
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
|
10965
|
+
cb(ffn_out, "ffn_out", il);
|
|
10966
|
+
|
|
10967
|
+
// MoE
|
|
10968
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
|
10969
|
+
model.layers[il].ffn_norm_exps, NULL,
|
|
10970
|
+
LLM_NORM_RMS, cb, il);
|
|
10971
|
+
cb(cur, "ffn_norm_exps", il);
|
|
10972
|
+
|
|
10973
|
+
cur = llm_build_moe_ffn(ctx0, cur,
|
|
10974
|
+
model.layers[il].ffn_gate_inp,
|
|
10975
|
+
model.layers[il].ffn_up_exps,
|
|
10976
|
+
model.layers[il].ffn_gate_exps,
|
|
10977
|
+
model.layers[il].ffn_down_exps,
|
|
10978
|
+
n_expert, n_expert_used,
|
|
10979
|
+
LLM_FFN_SILU, true,
|
|
10980
|
+
cb, il);
|
|
10981
|
+
cb(cur, "ffn_moe_out", il);
|
|
10982
|
+
|
|
10983
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
|
10676
10984
|
cb(cur, "ffn_out", il);
|
|
10677
10985
|
|
|
10678
10986
|
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
|
@@ -10688,8 +10996,8 @@ struct llm_build_context {
|
|
|
10688
10996
|
cur = inpL;
|
|
10689
10997
|
|
|
10690
10998
|
cur = llm_build_norm(ctx0, cur, hparams,
|
|
10691
|
-
|
|
10692
|
-
|
|
10999
|
+
model.output_norm, NULL,
|
|
11000
|
+
LLM_NORM_RMS, cb, -1);
|
|
10693
11001
|
cb(cur, "result_norm", -1);
|
|
10694
11002
|
|
|
10695
11003
|
// lm_head
|
|
@@ -10816,15 +11124,12 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
10816
11124
|
{
|
|
10817
11125
|
result = llm.build_starcoder();
|
|
10818
11126
|
} break;
|
|
10819
|
-
case LLM_ARCH_PERSIMMON:
|
|
10820
|
-
{
|
|
10821
|
-
result = llm.build_persimmon();
|
|
10822
|
-
} break;
|
|
10823
11127
|
case LLM_ARCH_REFACT:
|
|
10824
11128
|
{
|
|
10825
11129
|
result = llm.build_refact();
|
|
10826
11130
|
} break;
|
|
10827
11131
|
case LLM_ARCH_BERT:
|
|
11132
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
10828
11133
|
case LLM_ARCH_NOMIC_BERT:
|
|
10829
11134
|
{
|
|
10830
11135
|
result = llm.build_bert();
|
|
@@ -10913,6 +11218,14 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
10913
11218
|
{
|
|
10914
11219
|
result = llm.build_olmo();
|
|
10915
11220
|
} break;
|
|
11221
|
+
case LLM_ARCH_GPTNEOX:
|
|
11222
|
+
{
|
|
11223
|
+
result = llm.build_gptneox();
|
|
11224
|
+
} break;
|
|
11225
|
+
case LLM_ARCH_ARCTIC:
|
|
11226
|
+
{
|
|
11227
|
+
result = llm.build_arctic();
|
|
11228
|
+
} break;
|
|
10916
11229
|
default:
|
|
10917
11230
|
GGML_ASSERT(false);
|
|
10918
11231
|
}
|
|
@@ -11032,11 +11345,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11032
11345
|
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
|
|
11033
11346
|
f = -INFINITY;
|
|
11034
11347
|
} else {
|
|
11035
|
-
|
|
11348
|
+
if (hparams.use_alibi) {
|
|
11349
|
+
f = -fabs(lctx.kv_self.cells[i].pos - pos);
|
|
11350
|
+
} else {
|
|
11351
|
+
f = 0.0f;
|
|
11352
|
+
}
|
|
11036
11353
|
}
|
|
11037
11354
|
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
|
|
11038
11355
|
}
|
|
11039
11356
|
}
|
|
11357
|
+
|
|
11358
|
+
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
|
|
11359
|
+
for (int j = 0; j < n_kv; ++j) {
|
|
11360
|
+
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
|
|
11361
|
+
}
|
|
11362
|
+
}
|
|
11040
11363
|
}
|
|
11041
11364
|
} else {
|
|
11042
11365
|
// when using kv cache, the mask needs to match the kv cache size
|
|
@@ -11055,7 +11378,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11055
11378
|
float f = -INFINITY;
|
|
11056
11379
|
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
|
|
11057
11380
|
if (batch.seq_id[i][s] == seq_id) {
|
|
11058
|
-
|
|
11381
|
+
if (hparams.use_alibi) {
|
|
11382
|
+
f = -fabs(batch.pos[i] - batch.pos[j]);
|
|
11383
|
+
} else {
|
|
11384
|
+
f = 0.0f;
|
|
11385
|
+
}
|
|
11059
11386
|
break;
|
|
11060
11387
|
}
|
|
11061
11388
|
}
|
|
@@ -11071,21 +11398,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
|
11071
11398
|
}
|
|
11072
11399
|
}
|
|
11073
11400
|
|
|
11074
|
-
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
|
|
11075
|
-
// this allows to process multiple sequences in parallel with ALiBi-based models
|
|
11076
|
-
if (hparams.use_alibi) {
|
|
11077
|
-
const int64_t n_kv = kv_self.n;
|
|
11078
|
-
|
|
11079
|
-
GGML_ASSERT(lctx.inp_KQ_pos);
|
|
11080
|
-
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
|
|
11081
|
-
|
|
11082
|
-
float * data = (float *) lctx.inp_KQ_pos->data;
|
|
11083
|
-
|
|
11084
|
-
for (int i = 0; i < n_kv; ++i) {
|
|
11085
|
-
data[i] = float(lctx.kv_self.cells[i].pos);
|
|
11086
|
-
}
|
|
11087
|
-
}
|
|
11088
|
-
|
|
11089
11401
|
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
|
11090
11402
|
const int64_t n_tokens = batch.n_tokens;
|
|
11091
11403
|
|
|
@@ -11259,11 +11571,6 @@ static void llama_graph_compute(
|
|
|
11259
11571
|
llama_context & lctx,
|
|
11260
11572
|
ggml_cgraph * gf,
|
|
11261
11573
|
int n_threads) {
|
|
11262
|
-
#ifdef GGML_USE_MPI
|
|
11263
|
-
const int64_t n_layer = lctx.model.hparams.n_layer;
|
|
11264
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
|
11265
|
-
#endif
|
|
11266
|
-
|
|
11267
11574
|
#ifdef GGML_USE_METAL
|
|
11268
11575
|
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
|
11269
11576
|
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
|
@@ -11278,10 +11585,6 @@ static void llama_graph_compute(
|
|
|
11278
11585
|
ggml_backend_sched_graph_compute_async(lctx.sched, gf);
|
|
11279
11586
|
|
|
11280
11587
|
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
|
11281
|
-
|
|
11282
|
-
#ifdef GGML_USE_MPI
|
|
11283
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
11284
|
-
#endif
|
|
11285
11588
|
}
|
|
11286
11589
|
|
|
11287
11590
|
// decode a batch of tokens by evaluating the transformer
|
|
@@ -11319,12 +11622,6 @@ static int llama_decode_internal(
|
|
|
11319
11622
|
}
|
|
11320
11623
|
lctx.n_queued_tokens += n_tokens_all;
|
|
11321
11624
|
|
|
11322
|
-
#ifdef GGML_USE_MPI
|
|
11323
|
-
// TODO: needs fix after #3228
|
|
11324
|
-
GGML_ASSERT(false && "not implemented");
|
|
11325
|
-
//ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
|
11326
|
-
#endif
|
|
11327
|
-
|
|
11328
11625
|
auto & kv_self = lctx.kv_self;
|
|
11329
11626
|
|
|
11330
11627
|
const int64_t n_embd = hparams.n_embd;
|
|
@@ -11455,7 +11752,8 @@ static int llama_decode_internal(
|
|
|
11455
11752
|
// a heuristic, to avoid attending the full cache if it is not yet utilized
|
|
11456
11753
|
// after enough generations, the benefit from this heuristic disappears
|
|
11457
11754
|
// if we start defragmenting the cache, the benefit from this will be more important
|
|
11458
|
-
|
|
11755
|
+
const uint32_t pad = llama_kv_cache_get_padding(cparams);
|
|
11756
|
+
kv_self.n = std::min(kv_self.size, std::max(pad, GGML_PAD(llama_kv_cache_cell_max(kv_self), pad)));
|
|
11459
11757
|
//kv_self.n = llama_kv_cache_cell_max(kv_self);
|
|
11460
11758
|
}
|
|
11461
11759
|
}
|
|
@@ -12200,13 +12498,14 @@ struct llm_tokenizer_bpe {

     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
+        bool ignore_merges = false;

         std::vector<std::string> word_collection;
         switch (vocab.type) {
             case LLAMA_VOCAB_TYPE_BPE:
                 switch (vocab.type_pre) {
                     case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-
+                        ignore_merges = true;
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
                             //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12215,6 +12514,13 @@ struct llm_tokenizer_bpe {
                             "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                    case LLAMA_VOCAB_PRE_TYPE_SMAUG:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                     case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                         word_collection = unicode_regex_split(text, {
                             "[\r\n]",
@@ -12266,6 +12572,7 @@ struct llm_tokenizer_bpe {
                             "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                         });
                         break;
+                    case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
                     case LLAMA_VOCAB_PRE_TYPE_QWEN2:
                         word_collection = unicode_regex_split(text, {
                             // original regex from tokenizer.json
@@ -12298,6 +12605,11 @@ struct llm_tokenizer_bpe {
             int index = 0;
             size_t offset = 0;

+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
             while (offset < word.size()) {
                 llm_symbol sym;
                 size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
@@ -12483,16 +12795,16 @@ struct llm_tokenizer_wpm {
         // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
         for (uint32_t code : cpts_nfd) {
-
-            if (
+            const codepoint_flags flags = unicode_cpt_flags(code);
+            if (flags.is_accent_mark || flags.is_control) {
                 continue;
             }
             code = unicode_tolower(code);
-            if (
+            if (flags.is_separator || flags.is_whitespace) { //####FIXME: is_separator ?
                 code = ' ';
             }
             std::string s = unicode_cpt_to_utf8(code);
-            if (
+            if (flags.is_punctuation || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
                 new_str += s;
                 new_str += " ";
@@ -12695,9 +13007,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // tokenizer.encode('', add_special_tokens=True) returns [1]
                 // tokenizer.encode('', add_special_tokens=False) returns []

+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                 }

                 for (const auto & fragment : fragment_buffer) {
@@ -12709,9 +13026,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         // and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-
-
-
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) {  // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }

@@ -12723,9 +13052,22 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
+                        is_prev_special = true;
+                        // phi-3 special tokens without rtrim, works fine for llama-spm too
+                        special_token_rtrim = rtrim
+                            && fragment.token != vocab.special_bos_id
+                            && fragment.token != vocab.special_unk_id
+                            && fragment.token != vocab.special_eos_id;
                     }
                 }

+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
                 if (add_special && vocab.special_add_eos == 1) {
                     GGML_ASSERT(vocab.special_eos_id != -1);
                     output.push_back(vocab.special_eos_id);
@@ -12752,7 +13094,17 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     }
                 }

-
+                if (add_special && vocab.special_add_bos != 0 && output.size() >= 2 && output[1] == vocab.special_bos_id) {
+                    LLAMA_LOG_WARN(
+                        "%s: Added a BOS token to the prompt as specified by the model but the prompt "
+                        "also starts with a BOS token. So now the final prompt starts with 2 BOS tokens. "
+                        "Are you sure this is what you want?\n", __FUNCTION__);
+                }
+
+                if (add_special && vocab.special_add_eos == 1) {
+                    GGML_ASSERT(vocab.special_add_eos != -1);
+                    output.push_back(vocab.special_eos_id);
+                }
             } break;
         case LLAMA_VOCAB_TYPE_WPM:
             {
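The new check above warns when `add_special` prepends a BOS token and the tokenized prompt already starts with one. A small sketch of how this can be hit through the public `llama_tokenize` API; the prompt text, buffer sizing, and model handle are illustrative placeholders, not part of the diff:

```cpp
#include <string>
#include <vector>
#include "llama.h"

// Sketch: tokenize a prompt that already begins with the BOS literal ("<s>" for
// SPM-style vocabs) while also asking llama_tokenize() to add special tokens.
// With this change the library logs the "2 BOS tokens" warning instead of staying silent.
std::vector<llama_token> tokenize_with_bos(const llama_model * model, const std::string & prompt) {
    std::vector<llama_token> tokens(prompt.size() + 8); // generous for this sketch
    const int32_t n = llama_tokenize(model, prompt.c_str(), (int32_t) prompt.size(),
                                     tokens.data(), (int32_t) tokens.size(),
                                     /*add_special=*/true, /*parse_special=*/true);
    tokens.resize(n > 0 ? n : 0);
    return tokens; // e.g. prompt "<s> Hello" may now begin with two BOS ids
}
```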
@@ -13106,6 +13458,58 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates(
     return rejects;
 }

+static bool llama_grammar_detect_left_recursion(
+        const std::vector<std::vector<llama_grammar_element>> & rules,
+        size_t rule_index,
+        std::vector<bool> * rules_visited,
+        std::vector<bool> * rules_in_progress,
+        std::vector<bool> * rules_may_be_empty) {
+    if ((*rules_in_progress)[rule_index]) {
+        return true;
+    }
+
+    (*rules_in_progress)[rule_index] = true;
+
+    const std::vector<llama_grammar_element> & rule = rules[rule_index];
+
+    // First check if the rule might produce the empty string. This could be done combined with the second
+    // step but it's more readable as two steps.
+    bool at_rule_start = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            if (at_rule_start) {
+                (*rules_may_be_empty)[rule_index] = true;
+                break;
+            }
+            at_rule_start = true;
+        } else {
+            at_rule_start = false;
+        }
+    }
+
+    // Second, recurse into leftmost nonterminals (or next-leftmost as long as the previous nonterminal may
+    // be empty)
+    bool recurse_into_nonterminal = true;
+    for (size_t i = 0; i < rule.size(); i++) {
+        if (rule[i].type == LLAMA_GRETYPE_RULE_REF && recurse_into_nonterminal) {
+            if (llama_grammar_detect_left_recursion(rules, (size_t)rule[i].value, rules_visited, rules_in_progress, rules_may_be_empty)) {
+                return true;
+            }
+            if (!((*rules_may_be_empty)[(size_t)rule[i].value])) {
+                recurse_into_nonterminal = false;
+            }
+        } else if (llama_grammar_is_end_of_sequence(&rule[i])) {
+            recurse_into_nonterminal = true;
+        } else {
+            recurse_into_nonterminal = false;
+        }
+    }
+
+    (*rules_in_progress)[rule_index] = false;
+    (*rules_visited)[rule_index] = true;
+    return false;
+}
+
 //
 // grammar - external
 //
@@ -13125,6 +13529,19 @@ struct llama_grammar * llama_grammar_init(
         vec_rules[i].push_back({LLAMA_GRETYPE_END, 0});
     }

+    // Check for left recursion
+    std::vector<bool> rules_visited(n_rules);
+    std::vector<bool> rules_in_progress(n_rules);
+    std::vector<bool> rules_may_be_empty(n_rules);
+    for (size_t i = 0; i < n_rules; i++) {
+        if (rules_visited[i]) {
+            continue;
+        }
+        if (llama_grammar_detect_left_recursion(vec_rules, i, &rules_visited, &rules_in_progress, &rules_may_be_empty)) {
+            throw std::runtime_error(format("unsupported grammar, left recursion detected for nonterminal at index %zu", i));
+        }
+    }
+
     // loop over alternates of start rule to build initial stacks
     std::vector<std::vector<const llama_grammar_element *>> stacks;
     pos = vec_rules[start_rule_index].data();
@@ -13147,6 +13564,9 @@ struct llama_grammar * llama_grammar_init(
         }
     } while (true);

+    // Important: vec_rules has to be moved here, not copied, because stacks contains
+    // pointers to elements of vec_rules. If vec_rules were copied into llama_grammar
+    // then the pointers would be invalidated when the local vec_rules goes out of scope.
     return new llama_grammar{ std::move(vec_rules), std::move(stacks), {} };
 }

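With the hunks above, `llama_grammar_init` now detects left recursion up front and throws instead of expanding a left-recursive rule indefinitely. A rough, hand-built sketch of a rule set that trips the check, using the public `llama_grammar_element` encoding; the grammar itself is hypothetical and not taken from the diff:

```cpp
#include <cstdio>
#include <stdexcept>
#include <vector>
#include "llama.h"

int main() {
    // Rule 0 encodes "root ::= root 'a' | 'a'"; the first alternative begins with a
    // reference to rule 0 itself, i.e. direct left recursion.
    const std::vector<llama_grammar_element> rule0 = {
        { LLAMA_GRETYPE_RULE_REF, 0   },  // root
        { LLAMA_GRETYPE_CHAR,     'a' },  // 'a'
        { LLAMA_GRETYPE_ALT,      0   },  // |
        { LLAMA_GRETYPE_CHAR,     'a' },  // 'a'
        { LLAMA_GRETYPE_END,      0   },
    };
    const llama_grammar_element * rules[] = { rule0.data() };
    try {
        llama_grammar * grammar = llama_grammar_init(rules, /*n_rules=*/1, /*start_rule_index=*/0);
        llama_grammar_free(grammar);
    } catch (const std::exception & err) {
        // With this change: "unsupported grammar, left recursion detected for nonterminal at index 0"
        fprintf(stderr, "%s\n", err.what());
    }
    return 0;
}
```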
@@ -13741,9 +14161,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_

     // Sample the next word X using top-k sampling
     llama_sample_top_k(nullptr, candidates, int(k), 1);
-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();

@@ -13757,9 +14175,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
     // Update mu using the learning rate and error
     *mu = *mu - eta * e;

-
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
+    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     return X;
 }

@@ -14344,8 +14760,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                 use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
-        else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
-                (qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
         if (qs.model.type == MODEL_70B) {
             // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
             // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
@@ -15246,6 +15660,7 @@ struct llama_model_params llama_model_default_params() {
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
+        /*.rpc_servers =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.kv_overrides =*/ nullptr,
@@ -15316,7 +15731,9 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
 }

 size_t llama_max_devices(void) {
-#if defined(
+#if defined(GGML_USE_RPC)
+    return GGML_RPC_MAX_SERVERS;
+#elif defined(GGML_USE_METAL)
     return 1;
 #elif defined(GGML_USE_CUDA)
     return GGML_CUDA_MAX_DEVICES;
@@ -15339,7 +15756,7 @@ bool llama_supports_mlock(void) {

 bool llama_supports_gpu_offload(void) {
 #if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
-    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
+    defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
 #else
@@ -15356,10 +15773,6 @@ void llama_backend_init(void) {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_init();
-#endif
 }

 void llama_numa_init(enum ggml_numa_strategy numa) {
@@ -15369,9 +15782,6 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
 }

 void llama_backend_free(void) {
-#ifdef GGML_USE_MPI
-    ggml_mpi_backend_free();
-#endif
     ggml_quantize_free();
 }

@@ -15402,7 +15812,17 @@ struct llama_model * llama_load_model_from_file(
             return true;
         };
     }
-
+    if (params.rpc_servers != nullptr) {
+        // split the servers set them into model->rpc_servers
+        std::string servers(params.rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(",")) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            model->rpc_servers.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        model->rpc_servers.push_back(servers);
+    }
     int status = llama_model_load(path_model, *model, params);
     GGML_ASSERT(status <= 0);
     if (status < 0) {
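The loop added above splits the new comma-separated `rpc_servers` field into `model->rpc_servers`; when `GGML_USE_RPC` is defined, each entry is later turned into a backend via `ggml_backend_rpc_init` during context creation (see the `llama_new_context_with_model` hunk below). A minimal usage sketch; the endpoints are placeholders:

```cpp
#include "llama.h"

// Sketch: point a model at two hypothetical rpc-server instances.
// The string is split on ',' exactly as in the loop added above.
llama_model * load_with_rpc(const char * gguf_path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.rpc_servers = "192.168.1.10:50052,192.168.1.11:50052"; // placeholder endpoints
    mparams.n_gpu_layers = 99;                                     // offload layers to the remote backends
    return llama_load_model_from_file(gguf_path, mparams);
}
```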
@@ -15441,6 +15861,11 @@ struct llama_context * llama_new_context_with_model(
         return nullptr;
     }

+    if (params.flash_attn && model->arch == LLM_ARCH_GROK) {
+        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
+        params.flash_attn = false;
+    }
+
     llama_context * ctx = new llama_context(*model);

     const auto & hparams = model->hparams;
@@ -15464,7 +15889,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale;

     // this is necessary due to kv_self.n being padded later during inference
-    cparams.n_ctx = GGML_PAD(cparams.n_ctx,
+    cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_get_padding(cparams));

     // with causal attention, the batch size is limited by the context size
     cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch;
@@ -15499,6 +15924,7 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }

+    cparams.yarn_attn_factor *= hparams.rope_attn_factor;
     cparams.causal_attn = hparams.causal_attn;

     if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
@@ -15509,16 +15935,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-    if (cparams.flash_attn && hparams.use_alibi) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
-    if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
-        LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
@@ -15554,7 +15970,17 @@ struct llama_context * llama_new_context_with_model(

     if (!hparams.vocab_only) {
         // initialize backends
-#
+#if defined(GGML_USE_RPC)
+        for (auto & server : model->rpc_servers) {
+            ggml_backend_t backend = ggml_backend_rpc_init(server.c_str());
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to connect RPC backend to %s\n", __func__, server.c_str());
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
+#elif defined(GGML_USE_METAL)
         if (model->n_gpu_layers > 0) {
             ctx->backend_metal = ggml_backend_metal_init();
             if (ctx->backend_metal == nullptr) {
@@ -15710,7 +16136,11 @@ struct llama_context * llama_new_context_with_model(
         ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));

         // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
-        bool pipeline_parallel =
+        bool pipeline_parallel =
+            llama_get_device_count(*model) > 1 &&
+            model->n_gpu_layers > (int)model->hparams.n_layer &&
+            model->split_mode == LLAMA_SPLIT_MODE_LAYER &&
+            params.offload_kqv;
 #ifndef GGML_USE_CUDA
         // pipeline parallelism requires support for async compute and events
         // currently this is only implemented in the CUDA backend
@@ -15753,20 +16183,6 @@ struct llama_context * llama_new_context_with_model(
         }
     }

-#ifdef GGML_USE_MPI
-    ctx->ctx_mpi = ggml_mpi_init();
-
-    if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
-        // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
-        // TODO: needs fix after #3228
-        GGML_ASSERT(false && "not implemented");
-        //const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos(ctx));
-        //while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
-        llama_backend_free();
-        exit(1);
-    }
-#endif
-
     return ctx;
 }

@@ -15803,11 +16219,11 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         // these models do not use RoPE
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
-        case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_MPT:
         case LLM_ARCH_REFACT:
         case LLM_ARCH_BLOOM:
         case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -15822,13 +16238,13 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK:
         case LLM_ARCH_DBRX:
-        case LLM_ARCH_PERSIMMON:
         case LLM_ARCH_BERT:
         case LLM_ARCH_NOMIC_BERT:
         case LLM_ARCH_STABLELM:
@@ -15839,6 +16255,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_PHI3:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_GPTNEOX:
             return LLAMA_ROPE_TYPE_NEOX;

         // all model arches should be listed explicitly here
@@ -15998,6 +16415,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }

     // make tensors
+    cvec.tensors.reserve(model.hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < model.hparams.n_layer; il++) {
         struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
@@ -16006,6 +16424,8 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     }

     // allocate tensors / buffers and zero
+    cvec.ctxs.reserve(ctx_map.size());
+    cvec.bufs.reserve(ctx_map.size());
     for (auto it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
@@ -16829,13 +17249,13 @@ static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llam
         }
         else {
             if (cell_range_begin != kv_self.size) {
-                cell_ranges.
+                cell_ranges.emplace_back(cell_range_begin, i);
                 cell_range_begin = kv_self.size;
             }
         }
     }
     if (cell_range_begin != kv_self.size) {
-        cell_ranges.
+        cell_ranges.emplace_back(cell_range_begin, kv_self.size);
     }

     // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count
@@ -17214,6 +17634,14 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_
     ctx->cparams.n_threads_batch = n_threads_batch;
 }

+uint32_t llama_n_threads(struct llama_context * ctx) {
+    return ctx->cparams.n_threads;
+}
+
+uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+    return ctx->cparams.n_threads_batch;
+}
+
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
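The new `llama_n_threads` / `llama_n_threads_batch` getters mirror the existing `llama_set_n_threads` setter. A short usage sketch with arbitrary example thread counts:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch: adjust the thread counts on an existing context and read them back
// through the new getters (8 and 16 are arbitrary example values).
void tune_threads(llama_context * ctx) {
    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/16);
    printf("decode threads: %u, batch threads: %u\n",
           llama_n_threads(ctx), llama_n_threads_batch(ctx));
}
```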
@@ -17437,6 +17865,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }

+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
@@ -17648,6 +18080,15 @@ static int32_t llama_chat_apply_template_internal(
             }
         }
         // llama2 templates seem to not care about "add_generation_prompt"
+    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos)) {
+        // Phi 3
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|" << role << "|>\n" << message->content << "<|end|>\n";
+        }
+        if (add_ass) {
+            ss << "<|assistant|>\n";
+        }
     } else if (tmpl == "zephyr" || tmpl.find("<|user|>") != std::string::npos) {
         // zephyr template
         for (auto message : chat) {
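This hunk moves the Phi 3 branch ahead of the zephyr check so templates containing both `<|assistant|>` and `<|end|>` are matched as Phi 3 first, and it no longer wraps the message content in `trim()`; the old copy of the branch is removed in the next hunk. A small sketch of what the branch produces through the public `llama_chat_apply_template` API, with placeholder messages:

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

// Sketch: render two placeholder messages with the "phi3" template.
// Expected output with add_ass=true:
//   <|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|assistant|>\n
int main() {
    const std::vector<llama_chat_message> chat = {
        { "user",      "Hello"    },
        { "assistant", "Hi there" },
    };
    std::vector<char> buf(512);
    const int32_t n = llama_chat_apply_template(nullptr, "phi3", chat.data(), chat.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > 0) {
        printf("%.*s", n, buf.data());
    }
    return 0;
}
```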
@@ -17780,15 +18221,6 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) {
-        // Phi 3
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n";
-        }
-        if (add_ass) {
-            ss << "<|assistant|>\n";
-        }
     } else {
         // template not supported
         return -1;
@@ -17910,8 +18342,10 @@ const char * llama_print_system_info(void) {
     s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
     s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
     s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
+    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
@@ -17970,6 +18404,8 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
     ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#elif defined(GGML_USE_CUDA)
+    ggml_backend_cuda_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
