@fugood/llama.node 0.3.6 → 0.3.8
This diff shows the changes between publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- package/README.md +17 -2
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +3 -1
- package/lib/index.js +16 -1
- package/lib/index.ts +16 -0
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +4 -3
- package/src/LlamaCompletionWorker.cpp +4 -2
- package/src/LlamaContext.cpp +61 -6
- package/src/LlamaContext.h +1 -0
- package/src/common.hpp +6 -11
- package/src/llama.cpp/.github/workflows/build.yml +19 -17
- package/src/llama.cpp/.github/workflows/docker.yml +77 -30
- package/src/llama.cpp/.github/workflows/editorconfig.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +22 -3
- package/src/llama.cpp/CMakeLists.txt +49 -24
- package/src/llama.cpp/common/arg.cpp +82 -26
- package/src/llama.cpp/common/arg.h +3 -0
- package/src/llama.cpp/common/common.cpp +192 -72
- package/src/llama.cpp/common/common.h +51 -18
- package/src/llama.cpp/common/ngram-cache.cpp +12 -12
- package/src/llama.cpp/common/ngram-cache.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +11 -6
- package/src/llama.cpp/common/speculative.cpp +18 -15
- package/src/llama.cpp/docs/build.md +2 -0
- package/src/llama.cpp/examples/batched/batched.cpp +9 -7
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +10 -8
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +11 -8
- package/src/llama.cpp/examples/cvector-generator/mean.hpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +8 -7
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +7 -6
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +8 -7
- package/src/llama.cpp/examples/gguf/gguf.cpp +10 -6
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +1 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +8 -7
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +13 -10
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +13 -12
- package/src/llama.cpp/examples/infill/infill.cpp +23 -24
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +44 -13
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -6
- package/src/llama.cpp/examples/llava/clip.cpp +4 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +9 -6
- package/src/llama.cpp/examples/llava/llava.cpp +2 -2
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +8 -4
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +11 -8
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +6 -7
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +4 -9
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +3 -7
- package/src/llama.cpp/examples/lookup/lookup.cpp +5 -6
- package/src/llama.cpp/examples/main/main.cpp +51 -29
- package/src/llama.cpp/examples/parallel/parallel.cpp +5 -6
- package/src/llama.cpp/examples/passkey/passkey.cpp +7 -5
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +37 -23
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -14
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +8 -8
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +12 -0
- package/src/llama.cpp/examples/run/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.cpp +1351 -0
- package/src/llama.cpp/examples/run/linenoise.cpp/linenoise.h +114 -0
- package/src/llama.cpp/examples/run/run.cpp +175 -61
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -25
- package/src/llama.cpp/examples/server/CMakeLists.txt +1 -0
- package/src/llama.cpp/examples/server/httplib.h +1295 -409
- package/src/llama.cpp/examples/server/server.cpp +387 -181
- package/src/llama.cpp/examples/server/tests/requirements.txt +1 -0
- package/src/llama.cpp/examples/server/utils.hpp +170 -58
- package/src/llama.cpp/examples/simple/simple.cpp +9 -8
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +16 -12
- package/src/llama.cpp/examples/speculative/speculative.cpp +22 -23
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +8 -12
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +17 -5
- package/src/llama.cpp/examples/tts/tts.cpp +64 -23
- package/src/llama.cpp/ggml/CMakeLists.txt +5 -21
- package/src/llama.cpp/ggml/include/ggml-backend.h +2 -0
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -0
- package/src/llama.cpp/ggml/include/ggml.h +36 -145
- package/src/llama.cpp/ggml/include/gguf.h +202 -0
- package/src/llama.cpp/ggml/src/CMakeLists.txt +6 -3
- package/src/llama.cpp/ggml/src/ggml-alloc.c +5 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +79 -49
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +33 -23
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +57 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +87 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +335 -66
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +10 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1090 -378
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +3 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-impl.h +11 -16
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +16 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +154 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +18 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +40 -95
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +48 -48
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +24 -24
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +238 -164
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +105 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +3 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +7 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +1 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +74 -4
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +314 -116
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +9 -3
- package/src/llama.cpp/ggml/src/ggml.c +117 -1327
- package/src/llama.cpp/ggml/src/gguf.cpp +1329 -0
- package/src/llama.cpp/include/llama-cpp.h +6 -1
- package/src/llama.cpp/include/llama.h +138 -75
- package/src/llama.cpp/src/CMakeLists.txt +13 -1
- package/src/llama.cpp/src/llama-adapter.cpp +347 -0
- package/src/llama.cpp/src/llama-adapter.h +74 -0
- package/src/llama.cpp/src/llama-arch.cpp +1487 -0
- package/src/llama.cpp/src/llama-arch.h +400 -0
- package/src/llama.cpp/src/llama-batch.cpp +368 -0
- package/src/llama.cpp/src/llama-batch.h +88 -0
- package/src/llama.cpp/src/llama-chat.cpp +578 -0
- package/src/llama.cpp/src/llama-chat.h +52 -0
- package/src/llama.cpp/src/llama-context.cpp +1775 -0
- package/src/llama.cpp/src/llama-context.h +128 -0
- package/src/llama.cpp/src/llama-cparams.cpp +1 -0
- package/src/llama.cpp/src/llama-cparams.h +37 -0
- package/src/llama.cpp/src/llama-grammar.cpp +5 -4
- package/src/llama.cpp/src/llama-grammar.h +3 -1
- package/src/llama.cpp/src/llama-hparams.cpp +71 -0
- package/src/llama.cpp/src/llama-hparams.h +139 -0
- package/src/llama.cpp/src/llama-impl.cpp +167 -0
- package/src/llama.cpp/src/llama-impl.h +16 -136
- package/src/llama.cpp/src/llama-kv-cache.cpp +718 -0
- package/src/llama.cpp/src/llama-kv-cache.h +218 -0
- package/src/llama.cpp/src/llama-mmap.cpp +589 -0
- package/src/llama.cpp/src/llama-mmap.h +67 -0
- package/src/llama.cpp/src/llama-model-loader.cpp +1124 -0
- package/src/llama.cpp/src/llama-model-loader.h +167 -0
- package/src/llama.cpp/src/llama-model.cpp +3953 -0
- package/src/llama.cpp/src/llama-model.h +370 -0
- package/src/llama.cpp/src/llama-quant.cpp +934 -0
- package/src/llama.cpp/src/llama-quant.h +1 -0
- package/src/llama.cpp/src/llama-sampling.cpp +147 -32
- package/src/llama.cpp/src/llama-sampling.h +3 -19
- package/src/llama.cpp/src/llama-vocab.cpp +1832 -575
- package/src/llama.cpp/src/llama-vocab.h +97 -142
- package/src/llama.cpp/src/llama.cpp +7160 -20314
- package/src/llama.cpp/src/unicode.cpp +8 -3
- package/src/llama.cpp/tests/CMakeLists.txt +2 -0
- package/src/llama.cpp/tests/test-autorelease.cpp +3 -3
- package/src/llama.cpp/tests/test-backend-ops.cpp +370 -59
- package/src/llama.cpp/tests/test-chat-template.cpp +162 -125
- package/src/llama.cpp/tests/test-gguf.cpp +222 -187
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +1 -1
- package/src/llama.cpp/tests/test-sampling.cpp +0 -1
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +4 -4
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +9 -7
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +8 -6
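
Most of the llama.cpp churn in this release comes from splitting the formerly monolithic `src/llama.cpp` into dedicated translation units (`llama-model.cpp`, `llama-vocab.cpp`, `llama-kv-cache.cpp`, `llama-mmap.cpp`, ...) and from moving the GGUF reader out of `ggml.c` into the new `ggml/include/gguf.h` / `ggml/src/gguf.cpp`. As orientation for the loader code excerpted below, here is a minimal sketch (not part of the package) of walking a GGUF file's metadata with that C API; the model path is a placeholder:

```cpp
// Sketch: enumerate GGUF metadata keys, the same iteration llama_model::load_hparams
// performs below. Assumes the gguf.h API (gguf_init_from_file, gguf_get_n_kv,
// gguf_get_key, gguf_get_kv_type, gguf_type_name, gguf_free); "model.gguf" is hypothetical.
#include "gguf.h"

#include <cstdio>

int main() {
    gguf_init_params params = {
        /*.no_alloc =*/ true,   // metadata only, do not allocate tensor data
        /*.ctx      =*/ nullptr // no ggml context needed for key/value access
    };

    gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) {
        fprintf(stderr, "failed to open GGUF file\n");
        return 1;
    }

    for (int64_t i = 0; i < gguf_get_n_kv(ctx); ++i) {
        printf("%s: %s\n", gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }

    gguf_free(ctx);
    return 0;
}
```

The same loop appears in `load_hparams` in the diff below, where non-array keys are stringified into `gguf_kv`.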

package/src/llama.cpp/src/llama-model.cpp

@@ -0,0 +1,3953 @@
+#include "llama-model.h"
+
+#include "llama-impl.h"
+#include "llama-mmap.h"
+#include "llama-model-loader.h"
+
+#include "ggml-cpp.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <functional>
+#include <map>
+#include <sstream>
+#include <stdexcept>
+
+const char * llm_type_name(llm_type type) {
+    switch (type) {
+        case LLM_TYPE_14M: return "14M";
+        case LLM_TYPE_17M: return "17M";
+        case LLM_TYPE_22M: return "22M";
+        case LLM_TYPE_33M: return "33M";
+        case LLM_TYPE_60M: return "60M";
+        case LLM_TYPE_70M: return "70M";
+        case LLM_TYPE_80M: return "80M";
+        case LLM_TYPE_109M: return "109M";
+        case LLM_TYPE_137M: return "137M";
+        case LLM_TYPE_160M: return "160M";
+        case LLM_TYPE_220M: return "220M";
+        case LLM_TYPE_250M: return "250M";
+        case LLM_TYPE_270M: return "270M";
+        case LLM_TYPE_335M: return "335M";
+        case LLM_TYPE_410M: return "410M";
+        case LLM_TYPE_450M: return "450M";
+        case LLM_TYPE_770M: return "770M";
+        case LLM_TYPE_780M: return "780M";
+        case LLM_TYPE_0_5B: return "0.5B";
+        case LLM_TYPE_1B: return "1B";
+        case LLM_TYPE_1_3B: return "1.3B";
+        case LLM_TYPE_1_4B: return "1.4B";
+        case LLM_TYPE_1_5B: return "1.5B";
+        case LLM_TYPE_1_6B: return "1.6B";
+        case LLM_TYPE_2B: return "2B";
+        case LLM_TYPE_2_8B: return "2.8B";
+        case LLM_TYPE_3B: return "3B";
+        case LLM_TYPE_4B: return "4B";
+        case LLM_TYPE_6B: return "6B";
+        case LLM_TYPE_6_9B: return "6.9B";
+        case LLM_TYPE_7B: return "7B";
+        case LLM_TYPE_8B: return "8B";
+        case LLM_TYPE_9B: return "9B";
+        case LLM_TYPE_11B: return "11B";
+        case LLM_TYPE_12B: return "12B";
+        case LLM_TYPE_13B: return "13B";
+        case LLM_TYPE_14B: return "14B";
+        case LLM_TYPE_15B: return "15B";
+        case LLM_TYPE_16B: return "16B";
+        case LLM_TYPE_20B: return "20B";
+        case LLM_TYPE_30B: return "30B";
+        case LLM_TYPE_32B: return "32B";
+        case LLM_TYPE_34B: return "34B";
+        case LLM_TYPE_35B: return "35B";
+        case LLM_TYPE_40B: return "40B";
+        case LLM_TYPE_65B: return "65B";
+        case LLM_TYPE_70B: return "70B";
+        case LLM_TYPE_236B: return "236B";
+        case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_671B: return "671B";
+        case LLM_TYPE_SMALL: return "0.1B";
+        case LLM_TYPE_MEDIUM: return "0.4B";
+        case LLM_TYPE_LARGE: return "0.8B";
+        case LLM_TYPE_XL: return "1.5B";
+        case LLM_TYPE_A1_7B: return "A1.7B";
+        case LLM_TYPE_A2_7B: return "A2.7B";
+        case LLM_TYPE_8x7B: return "8x7B";
+        case LLM_TYPE_8x22B: return "8x22B";
+        case LLM_TYPE_16x12B: return "16x12B";
+        case LLM_TYPE_16x3_8B: return "16x3.8B";
+        case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
+        case LLM_TYPE_57B_A14B: return "57B.A14B";
+        case LLM_TYPE_27B: return "27B";
+        default: return "?B";
+    }
+}
+
+static const char * llama_expert_gating_func_name(llama_expert_gating_func_type type) {
+    switch (type) {
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
+        default: return "unknown";
+    }
+}
+
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
+};
+
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
+    for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
+        if (kv.second == name) {
+            return (llama_rope_scaling_type) kv.first;
+        }
+    }
+
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+}
+
+// checks if the weight tensor can be used with the specified buffer type and device
+static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w, ggml_op op, ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev) {
+    GGML_ASSERT(w != nullptr);
+
+    if (op == GGML_OP_NONE) {
+        return true;
+    }
+
+    ggml_init_params params = {
+        /*.mem_size =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+    ggml_context_ptr ctx_ptr { ggml_init(params) };
+    if (!ctx_ptr) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+    ggml_context * ctx = ctx_ptr.get();
+
+    ggml_tensor * op_tensor = nullptr;
+
+    switch (op) {
+        case GGML_OP_GET_ROWS:
+            {
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_get_rows(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT:
+            {
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], 512, w->ne[2], w->ne[3]);
+                op_tensor = ggml_mul_mat(ctx, w, b);
+            } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                int n_expert_used = hparams.n_expert_used;
+                ggml_tensor * b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, w->ne[0], n_expert_used, 512);
+                ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_expert_used, 512);
+                op_tensor = ggml_mul_mat_id(ctx, w, b, ids);
+            } break;
+        case GGML_OP_ADD:
+            {
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+                op_tensor = ggml_add(ctx, a, w);
+            } break;
+        case GGML_OP_MUL:
+            {
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
+                op_tensor = ggml_mul(ctx, a, w);
+            } break;
+        case GGML_OP_DIV:
+            {
+                ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, w->ne[0]);
+                op_tensor = ggml_div(ctx, a, w);
+            } break;
+        case GGML_OP_ROPE:
+            {
+                int n_embd_head = hparams.n_embd_head_v;
+                int n_head = hparams.n_head();
+                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
+                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
+                op_tensor = ggml_rope_ext(
+                    ctx, a, b, w,
+                    0, 0, 0, 0, 0,
+                    0, 0, 0, 0
+                );
+
+            } break;
+        case GGML_OP_SSM_CONV:
+            {
+                // FIXME
+                ggml_tensor * conv_x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 12345, w->ne[1], 6789);
+                op_tensor = ggml_ssm_conv(ctx, conv_x, w);
+            } break;
+        case GGML_OP_SSM_SCAN:
+            {
+                // FIXME
+                const int64_t d_state = w->ne[0];
+                const int64_t d_inner = w->ne[1];
+                const int64_t n_seq_tokens = 512;
+                const int64_t n_seqs = 1;
+                ggml_tensor * s = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, d_inner, n_seqs);
+                ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * dt = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_inner, n_seq_tokens, n_seqs);
+                ggml_tensor * B = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                ggml_tensor * C = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, d_state, n_seq_tokens, n_seqs);
+                op_tensor = ggml_ssm_scan(ctx, s, x, dt, w, B, C);
+            } break;
+        case GGML_OP_RWKV_WKV6:
+            {
+                // FIXME
+                const int64_t S = 123;
+                const int64_t H = 123;
+                const int64_t n_tokens = 123;
+                const int64_t n_seqs = 123;
+                ggml_tensor * k = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor * v = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor * r = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor * tf = w;
+                ggml_tensor * td = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, S, H, n_tokens);
+                ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
+                op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
+            } break;
+        case GGML_OP_IM2COL:
+            {
+                const int n_embd = hparams.n_embd;
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+            } break;
+        default:
+            GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
+    }
+
+    // create a temporary dummy buffer for the weight so that supports_op can check the buffer type
+    GGML_ASSERT(w->buffer == nullptr);
+    w->buffer = ggml_backend_buft_alloc_buffer(buft, 0);
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+    ggml_backend_buffer_free(w->buffer);
+    w->buffer = nullptr;
+
+    return op_supported;
+}
+
+// lists of buffer types used for each layer
+using buft_list_t = std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>>;
+
+// find the first buffer type in the list that can use the tensor
+static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hparams, ggml_tensor * tensor, ggml_op op, const buft_list_t & buft_list) {
+    GGML_ASSERT(!buft_list.empty());
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (weight_buft_supported(hparams, tensor, op, cur_buft, cur_dev)) {
+            return cur_buft;
+        }
+    }
+    return nullptr;
+}
+
+// CPU: ACCEL -> CPU extra -> GPU host -> CPU
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+    buft_list_t buft_list;
+
+    // add ACCEL buffer types
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            auto * buft = ggml_backend_dev_buffer_type(dev);
+            // skip
+            if (buft != ggml_backend_cpu_buffer_type()) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add extra buffer types
+    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+    if (ggml_backend_dev_get_extra_bufts_fn) {
+        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+        while (extra_bufts && *extra_bufts) {
+            buft_list.emplace_back(cpu_dev, *extra_bufts);
+            ++extra_bufts;
+        }
+    }
+
+    // add a host buffer type
+    // storing the tensors in a host buffer is useful when the processing of large batches
+    // is offloaded to a GPU device, since it reduces the time spent on data transfers
+    // generally, this will be done using the first device in the list
+    // a better approach would be to handle this on a weight-by-weight basis using the offload_op
+    // function of the device to determine if it would benefit from being stored in a host buffer
+    for (auto * dev : devices) {
+        ggml_backend_buffer_type_t buft = ggml_backend_dev_host_buffer_type(dev);
+        if (buft) {
+            buft_list.emplace_back(dev, buft);
+            break;
+        }
+    }
+
+    // add the CPU buffer type
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+        }
+    }
+
+    return buft_list;
+}
+
+// GPU: split if LLAMA_SPLIT_MODE_ROW -> GPU
+static buft_list_t make_gpu_buft_list(ggml_backend_dev_t dev, enum llama_split_mode split_mode, const float * tensor_split) {
+    buft_list_t buft_list;
+
+    // add the device split buffer type if requested and available
+    if (split_mode == LLAMA_SPLIT_MODE_ROW) {
+        ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+        auto ggml_backend_split_buffer_type_fn = (ggml_backend_split_buffer_type_t)
+            ggml_backend_reg_get_proc_address(reg, "ggml_backend_split_buffer_type");
+        if (ggml_backend_split_buffer_type_fn) {
+            size_t dev_index = [&]() {
+                auto * reg = ggml_backend_dev_backend_reg(dev);
+                for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); ++i) {
+                    if (ggml_backend_reg_dev_get(reg, i) == dev) {
+                        return i;
+                    }
+                }
+                throw std::runtime_error(format("device %s not found in its backend reg", ggml_backend_dev_name(dev)));
+            }();
+            auto * buft = ggml_backend_split_buffer_type_fn(dev_index, tensor_split);
+            if (buft != nullptr) {
+                buft_list.emplace_back(dev, buft);
+            }
+        }
+    }
+
+    // add the device default buffer type
+    buft_list.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+
+    return buft_list;
+}
+
+struct llama_model::impl {
+    impl() {}
+    ~impl() {}
+
+    uint64_t n_elements = 0;
+
+    size_t n_bytes = 0;
+
+    std::string desc_str;
+
+    // model memory mapped files
+    llama_mmaps mappings;
+
+    // objects representing data potentially being locked in memory
+    llama_mlocks mlock_bufs;
+    llama_mlocks mlock_mmaps;
+
+    // contexts where the model tensors metadata is stored
+    std::vector<ggml_context_ptr> ctxs;
+
+    // the model memory buffers for the tensor data
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    buft_list_t cpu_buft_list;
+    std::map<ggml_backend_dev_t, buft_list_t> gpu_buft_list;
+
+    struct layer_dev {
+        ggml_backend_dev_t dev;
+        buft_list_t * buft_list;
+    };
+
+    layer_dev dev_input = {};
+    layer_dev dev_output = {};
+    std::vector<layer_dev> dev_layer;
+};
+
+llama_model::llama_model(const struct llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
+}
+
+llama_model::~llama_model() {}
+
+void llama_model::load_stats(llama_model_loader & ml) {
+    pimpl->n_elements = ml.n_elements;
+    pimpl->n_bytes = ml.n_bytes;
+}
+
+void llama_model::load_arch(llama_model_loader & ml) {
+    arch = ml.get_arch();
+    if (arch == LLM_ARCH_UNKNOWN) {
+        throw std::runtime_error("unknown model architecture: '" + ml.get_arch_name() + "'");
+    }
+}
+
+void llama_model::load_hparams(llama_model_loader & ml) {
+    const gguf_context * ctx = ml.meta.get();
+
+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        gguf_kv.emplace(name, value);
+    }
+
+    // get general kv
+    ml.get_key(LLM_KV_GENERAL_NAME, name, false);
+
+    // everything past this point is not vocab-related
+    if (hparams.vocab_only) {
+        return;
+    }
+
+    ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+    ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+    ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
+    ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
+
+    if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+    }
+
+    GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
+    GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
+    if (hparams.n_expert > 0) {
+        GGML_ASSERT(hparams.n_expert_used > 0);
+    } else {
+        GGML_ASSERT(hparams.n_expert_used == 0);
+    }
+
+    // zero-out the array hparams
+    std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
+    std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
+    std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
+
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
+
+    // n_head_kv is optional, default to n_head
+    hparams.n_head_kv_arr = hparams.n_head_arr;
+
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false);
+
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
+
+    hparams.n_ctx_orig_yarn = hparams.n_ctx_train;
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false);
+
+    // rope_freq_base (optional)
+    hparams.rope_freq_base_train = 10000.0f;
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
+
+    std::string rope_scaling("linear");
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
+    hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
+
+    // rope_freq_scale (inverse of the kv) is optional
+    float ropescale = 0.0f;
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
+    }
+    hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
+
+    ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
+
+    // non-transformer models do not have attention heads
+    if (hparams.n_head() > 0) {
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
+
+        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+
+        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+
+        // sanity check for n_rot (optional)
+        hparams.n_rot = hparams.n_embd_head_k;
+
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+
+        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd_head_k) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+            }
+        }
+    } else {
+        hparams.n_rot = 0;
+        hparams.n_embd_head_k = 0;
+        hparams.n_embd_head_v = 0;
+    }
+
+    // for differentiating model types
+    uint32_t n_vocab = 0;
+    ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);
+
+    // arch-specific KVs
+    switch (arch) {
+        case LLM_ARCH_LLAMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 8) {
+                    switch (hparams.n_layer) {
+                        case 32: type = LLM_TYPE_8x7B; break;
+                        case 56: type = LLM_TYPE_8x22B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                } else {
+                    switch (hparams.n_layer) {
+                        case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B
+                        case 22: type = LLM_TYPE_1B; break;
+                        case 26: type = LLM_TYPE_3B; break;
+                        case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                        // granite uses a vocab with len 49152
+                        case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
+                        case 36: type = LLM_TYPE_8B; break; // granite
+                        case 40: type = LLM_TYPE_13B; break;
+                        case 48: type = LLM_TYPE_34B; break;
+                        case 60: type = LLM_TYPE_30B; break;
+                        case 80: type = hparams.n_head() == hparams.n_head_kv() ? LLM_TYPE_65B : LLM_TYPE_70B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                }
+            } break;
+        case LLM_ARCH_DECI:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MINICPM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+
+                switch (hparams.n_layer) {
+                    case 52: type = LLM_TYPE_1B; break;
+                    case 40: type = LLM_TYPE_2B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MINICPM3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+
+                switch (hparams.n_layer) {
+                    case 62: type = LLM_TYPE_4B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GROK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 64: type = LLM_TYPE_314B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_FALCON:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 60: type = LLM_TYPE_40B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BAICHUAN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                if (type == LLM_TYPE_13B) {
+                    // TODO: become GGUF KV parameter
+                    hparams.f_max_alibi_bias = 8.0f;
+                }
+            } break;
+        case LLM_ARCH_STARCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 36: type = LLM_TYPE_3B; break;
+                    case 42: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_15B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_REFACT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_1B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
+            } break;
+        case LLM_ARCH_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                switch (hparams.n_layer) {
+                    case 3:
+                        type = LLM_TYPE_17M; break; // bge-micro
+                    case 6:
+                        type = LLM_TYPE_22M; break; // MiniLM-L6
+                    case 12:
+                        switch (hparams.n_embd) {
+                            case 384: type = LLM_TYPE_33M; break; // MiniLM-L12, bge-small
+                            case 768: type = LLM_TYPE_109M; break; // bge-base
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        type = LLM_TYPE_335M; break; // bge-large
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_JINA_BERT_V2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+                hparams.f_max_alibi_bias = 8.0f;
+
+                switch (hparams.n_layer) {
+                    case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small
+                    case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_NOMIC_BERT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+
+                if (hparams.n_layer == 12 && hparams.n_embd == 768) {
+                    type = LLM_TYPE_137M;
+                }
+            } break;
+        case LLM_ARCH_BLOOM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 30:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            case 4096: type = LLM_TYPE_7B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // TODO: become GGUF KV parameter
+                hparams.f_max_alibi_bias = 8.0f;
+            } break;
+        case LLM_ARCH_MPT:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 48: type = LLM_TYPE_30B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_STABLELM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    case 40: type = LLM_TYPE_12B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_QWEN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true);
+            }
+            // fall through
+        case LLM_ARCH_QWEN2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
+                    case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 36: type = LLM_TYPE_3B; break;
+                    case 40: type = hparams.n_head() == 20 ? LLM_TYPE_4B : LLM_TYPE_13B; break;
+                    case 48: type = LLM_TYPE_14B; break;
+                    case 64: type = LLM_TYPE_32B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_QWEN2MOE:
+            {
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
+                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_A2_7B; break;
+                    case 28: type = LLM_TYPE_57B_A14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PHI3:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_3B; break;
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+
+                // for backward compatibility ; see: https://github.com/ggerganov/llama.cpp/pull/8931
+                if ((hparams.n_layer == 32 || hparams.n_layer == 40) && hparams.n_ctx_train == 4096) {
+                    // default value for Phi-3-mini-4k-instruct and Phi-3-medium-4k-instruct
+                    hparams.n_swa = 2047;
+                } else if (hparams.n_layer == 32 && hparams.n_head_kv(0) == 32 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-mini-128k-instruct
+                    hparams.n_swa = 262144;
+                } else if (hparams.n_layer == 40 && hparams.n_ctx_train == 131072) {
+                    // default value for Phi-3-medium-128k-instruct
+                    hparams.n_swa = 131072;
+                }
+                bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                if (!found_swa && hparams.n_swa == 0) {
+                    throw std::runtime_error("invalid value for sliding_window");
+                }
+            } break;
+        case LLM_ARCH_PHIMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_16x3_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_PLAMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GPT2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 12: type = LLM_TYPE_SMALL; break;
+                    case 24: type = LLM_TYPE_MEDIUM; break;
+                    case 36: type = LLM_TYPE_LARGE; break;
+                    case 48: type = LLM_TYPE_XL; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_CODESHELL:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 42: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ORION:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_14B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_INTERNLM2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 48: type = LLM_TYPE_20B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GEMMA:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 18: type = LLM_TYPE_2B; break;
+                    case 28: type = LLM_TYPE_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GEMMA2:
+            {
+                hparams.n_swa = 4096; // default value of gemma 2
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);
+                hparams.attn_soft_cap = true;
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_2B; break;
+                    case 42: type = LLM_TYPE_9B; break;
+                    case 46: type = LLM_TYPE_27B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_STARCODER2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 30: type = LLM_TYPE_3B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_15B; break;
+                    case 52: type = LLM_TYPE_20B; break; // granite
+                    case 88: type = LLM_TYPE_34B; break; // granite
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_MAMBA:
+            {
+                ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+                ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
+                ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
+                ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
+                ml.get_key(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms, false);
+
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 24:
+                        switch (hparams.n_embd) {
+                            case 768: type = LLM_TYPE_SMALL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 48:
+                        switch (hparams.n_embd) {
+                            case 1024: type = LLM_TYPE_MEDIUM; break;
+                            case 1536: type = LLM_TYPE_LARGE; break;
+                            case 2048: type = LLM_TYPE_XL; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 64:
+                        switch (hparams.n_embd) {
+                            case 2560: type = LLM_TYPE_3B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_XVERSE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    case 80: type = LLM_TYPE_65B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_35B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_COHERE2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 32: type = LLM_TYPE_8B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DBRX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
+
+                switch (hparams.n_layer) {
+                    case 40: type = LLM_TYPE_16x12B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMO:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
+
+                switch (hparams.n_layer) {
+                    case 22: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 80: type = LLM_TYPE_70B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMO2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_1B; break;
+                    case 32: type = LLM_TYPE_7B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OLMOE:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_A1_7B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_OPENELM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: type = LLM_TYPE_270M; break;
+                    case 20: type = LLM_TYPE_450M; break;
+                    case 28: type = LLM_TYPE_1B; break;
+                    case 36: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_GPTNEOX:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+                switch (hparams.n_layer) {
+                    case 6:
+                        switch (hparams.n_ff()) {
+                            case 512: type = LLM_TYPE_14M; break;
+                            case 2048: type = LLM_TYPE_70M; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 12:
+                        switch (hparams.n_ff()) {
+                            case 3072: type = LLM_TYPE_160M; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 16:
+                        switch (hparams.n_ff()) {
+                            case 8192: type = LLM_TYPE_1B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff()) {
+                            case 4096: type = LLM_TYPE_410M; break;
+                            case 8192: type = LLM_TYPE_1_4B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 32:
+                        switch (hparams.n_ff()) {
+                            case 10240: type = LLM_TYPE_2_8B; break;
+                            case 16384: type = LLM_TYPE_6_9B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 36:
+                        switch (hparams.n_ff()) {
+                            case 20480: type = LLM_TYPE_12B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 44:
+                        switch (hparams.n_ff()) {
+                            case 24576: type = LLM_TYPE_20B; break;
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: type = LLM_TYPE_10B_128x3_66B; break;
+                        default: type = LLM_TYPE_UNKNOWN;
+                    }
+                } else {
+                    type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: type = LLM_TYPE_20B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_DEEPSEEK2:
+            {
+                bool is_lite = (hparams.n_layer == 27);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                if (!is_lite) {
+                    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
+                }
+                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
+                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
+                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+                    // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
+                    // that have no expert_gating_func model parameter set
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
+                }
+                ml.get_key(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
+
+                switch (hparams.n_layer) {
+                    case 27: type = LLM_TYPE_16B; break;
+                    case 60: type = LLM_TYPE_236B; break;
+                    case 61: type = LLM_TYPE_671B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_CHATGLM:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 28: type = LLM_TYPE_6B; break;
+                    case 40: type = LLM_TYPE_9B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_BITNET:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 26: type = LLM_TYPE_3B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_T5:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+
+                uint32_t dec_start_token_id;
+                if (ml.get_key(LLM_KV_DECODER_START_TOKEN_ID, dec_start_token_id, false)) {
+                    hparams.dec_start_token_id = dec_start_token_id;
+                }
+
+                switch (hparams.n_layer) {
+                    case 6: type = LLM_TYPE_60M; break; // t5-small
+                    case 8: type = LLM_TYPE_80M; break; // flan-t5-small
+                    case 12:
+                        switch (hparams.n_ff()) {
+                            case 3072: type = LLM_TYPE_220M; break; // t5-base
+                            case 2048: type = LLM_TYPE_250M; break; // flan-t5-base
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    case 24:
+                        switch (hparams.n_ff()) {
+                            case 4096: type = LLM_TYPE_770M; break; // t5-large
+                            case 2816: type = LLM_TYPE_780M; break; // flan-t5-large
+                            case 16384: type = LLM_TYPE_3B; break; // t5-3b
+                            case 5120: type = LLM_TYPE_3B; break; // flan-t5-xl
+                            case 65536: type = LLM_TYPE_11B; break; // t5-11b
+                            case 10240: type = LLM_TYPE_11B; break; // flan-t5-xxl
+                            default: type = LLM_TYPE_UNKNOWN;
+                        } break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
+        case LLM_ARCH_T5ENCODER:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
+                type = LLM_TYPE_UNKNOWN;
+            } break;
+        case LLM_ARCH_JAIS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
+
+                switch (hparams.n_layer) {
+                    case 24: type = LLM_TYPE_1_3B; break;
+                    case 40: type = LLM_TYPE_13B; break;
+                    /* TODO: add variants */
|
|
1157
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1158
|
+
}
|
|
1159
|
+
} break;
|
|
1160
|
+
case LLM_ARCH_NEMOTRON:
|
|
1161
|
+
{
|
|
1162
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
1163
|
+
switch (hparams.n_layer) {
|
|
1164
|
+
case 32: type = LLM_TYPE_4B; break;
|
|
1165
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1166
|
+
}
|
|
1167
|
+
} break;
|
|
1168
|
+
case LLM_ARCH_EXAONE:
|
|
1169
|
+
{
|
|
1170
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1171
|
+
|
|
1172
|
+
switch (hparams.n_layer) {
|
|
1173
|
+
case 32: type = LLM_TYPE_8B; break;
|
|
1174
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1175
|
+
}
|
|
1176
|
+
} break;
|
|
1177
|
+
case LLM_ARCH_RWKV6:
|
|
1178
|
+
case LLM_ARCH_RWKV6QWEN2:
|
|
1179
|
+
{
|
|
1180
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false);
|
|
1181
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false);
|
|
1182
|
+
ml.get_key(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
|
1183
|
+
ml.get_key(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
|
|
1184
|
+
ml.get_key(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
|
|
1185
|
+
ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false);
|
|
1186
|
+
ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false);
|
|
1187
|
+
|
|
1188
|
+
switch (hparams.n_layer) {
|
|
1189
|
+
case 24: type = LLM_TYPE_1_6B; break;
|
|
1190
|
+
case 32:
|
|
1191
|
+
switch (hparams.n_embd) {
|
|
1192
|
+
case 2560: type = LLM_TYPE_3B; break;
|
|
1193
|
+
case 4096: type = LLM_TYPE_7B; break;
|
|
1194
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1195
|
+
} break;
|
|
1196
|
+
case 61: type = LLM_TYPE_14B; break;
|
|
1197
|
+
case 64: type = LLM_TYPE_32B; break;
|
|
1198
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1199
|
+
}
|
|
1200
|
+
} break;
|
|
1201
|
+
case LLM_ARCH_GRANITE:
|
|
1202
|
+
case LLM_ARCH_GRANITE_MOE:
|
|
1203
|
+
{
|
|
1204
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1205
|
+
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
|
1206
|
+
ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
|
|
1207
|
+
ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
|
1208
|
+
ml.get_key(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
|
1209
|
+
|
|
1210
|
+
switch (hparams.n_layer) {
|
|
1211
|
+
case 32: type = LLM_TYPE_3B; break;
|
|
1212
|
+
case 40: type = LLM_TYPE_3B; break;
|
|
1213
|
+
// Add additional layer/vocab/etc checks here for other model sizes
|
|
1214
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1215
|
+
}
|
|
1216
|
+
} break;
|
|
1217
|
+
case LLM_ARCH_CHAMELEON:
|
|
1218
|
+
{
|
|
1219
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1220
|
+
hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default
|
|
1221
|
+
ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm);
|
|
1222
|
+
|
|
1223
|
+
switch (hparams.n_layer) {
|
|
1224
|
+
case 32: type = LLM_TYPE_7B; break;
|
|
1225
|
+
case 48: type = LLM_TYPE_34B; break;
|
|
1226
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1227
|
+
}
|
|
1228
|
+
} break;
|
|
1229
|
+
case LLM_ARCH_WAVTOKENIZER_DEC:
|
|
1230
|
+
{
|
|
1231
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
1232
|
+
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
|
|
1233
|
+
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
|
1234
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
1235
|
+
} break;
|
|
1236
|
+
default: throw std::runtime_error("unsupported model architecture");
|
|
1237
|
+
}
|
|
1238
|
+
|
|
1239
|
+
pimpl->n_bytes = ml.n_bytes;
|
|
1240
|
+
|
|
1241
|
+
pimpl->desc_str = arch_name() + " " + type_name() + " " + ml.ftype_name();
|
|
1242
|
+
|
|
1243
|
+
if (hparams.f_max_alibi_bias > 0.0f) {
|
|
1244
|
+
hparams.use_alibi = true;
|
|
1245
|
+
}
|
|
1246
|
+
|
|
1247
|
+
hparams.rope_type = llama_model_rope_type(this);
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
void llama_model::load_vocab(llama_model_loader & ml) {
|
|
1251
|
+
const auto kv = LLM_KV(arch);
|
|
1252
|
+
|
|
1253
|
+
vocab.load(ml, kv);
|
|
1254
|
+
}
|
|
1255
|
+
|
|
1256
|
+
bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
1257
|
+
const auto & split_mode = params.split_mode;
|
|
1258
|
+
const auto & n_gpu_layers = params.n_gpu_layers;
|
|
1259
|
+
const auto & use_mlock = params.use_mlock;
|
|
1260
|
+
const auto & tensor_split = params.tensor_split;
|
|
1261
|
+
|
|
1262
|
+
const int n_layer = hparams.n_layer;
|
|
1263
|
+
|
|
1264
|
+
const bool use_mmap_buffer = true;
|
|
1265
|
+
|
|
1266
|
+
// build a list of buffer types for the CPU and GPU devices
|
|
1267
|
+
pimpl->cpu_buft_list = make_cpu_buft_list(devices);
|
|
1268
|
+
for (auto * dev : devices) {
|
|
1269
|
+
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
|
|
1270
|
+
// add CPU buffer types as a fallback
|
|
1271
|
+
buft_list.insert(buft_list.end(), pimpl->cpu_buft_list.begin(), pimpl->cpu_buft_list.end());
|
|
1272
|
+
pimpl->gpu_buft_list.emplace(dev, std::move(buft_list));
|
|
1273
|
+
}
|
|
1274
|
+
|
|
1275
|
+
// calculate the split points
|
|
1276
|
+
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + n_devices(), [](float x) { return x == 0.0f; });
|
|
1277
|
+
std::vector<float> splits(n_devices());
|
|
1278
|
+
if (all_zero) {
|
|
1279
|
+
// default split, by free memory
|
|
1280
|
+
for (size_t i = 0; i < n_devices(); ++i) {
|
|
1281
|
+
ggml_backend_dev_t dev = devices[i];
|
|
1282
|
+
size_t total;
|
|
1283
|
+
size_t free;
|
|
1284
|
+
ggml_backend_dev_memory(dev, &free, &total);
|
|
1285
|
+
splits[i] = free;
|
|
1286
|
+
}
|
|
1287
|
+
} else {
|
|
1288
|
+
std::copy(tensor_split, tensor_split + n_devices(), splits.begin());
|
|
1289
|
+
}
|
|
1290
|
+
|
|
1291
|
+
// sum and normalize the splits to get the split points
|
|
1292
|
+
float split_sum = 0.0f;
|
|
1293
|
+
for (size_t i = 0; i < n_devices(); ++i) {
|
|
1294
|
+
split_sum += splits[i];
|
|
1295
|
+
splits[i] = split_sum;
|
|
1296
|
+
}
|
|
1297
|
+
for (size_t i = 0; i < n_devices(); ++i) {
|
|
1298
|
+
splits[i] /= split_sum;
|
|
1299
|
+
}
|
|
1300
|
+
|
|
1301
|
+
ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1302
|
+
const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
|
|
1303
|
+
const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
|
|
1304
|
+
auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
|
|
1305
|
+
if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) {
|
|
1306
|
+
return {cpu_dev, &pimpl->cpu_buft_list};
|
|
1307
|
+
}
|
|
1308
|
+
const int layer_gpu = std::upper_bound(splits.begin(), splits.begin() + n_devices(), float(il - i_gpu_start)/act_gpu_layers) - splits.begin();
|
|
1309
|
+
auto * dev = devices.at(layer_gpu);
|
|
1310
|
+
return {dev, &pimpl->gpu_buft_list.at(dev)};
|
|
1311
|
+
};
|
|
1312
|
+
|
|
1313
|
+
// assign the input layer
|
|
1314
|
+
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
|
1315
|
+
pimpl->dev_input = { cpu_dev, &pimpl->cpu_buft_list };
|
|
1316
|
+
|
|
1317
|
+
// assign the repeating layers to the devices according to the splits
|
|
1318
|
+
pimpl->dev_layer.resize(n_layer);
|
|
1319
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
1320
|
+
pimpl->dev_layer[il] = get_layer_buft_list(il);
|
|
1321
|
+
}
|
|
1322
|
+
|
|
1323
|
+
// assign the output layer
|
|
1324
|
+
pimpl->dev_output = get_layer_buft_list(n_layer);
|
|
1325
|
+
|
|
1326
|
+
// one ggml context per buffer type
|
|
1327
|
+
int max_n_tensors = ml.n_tensors;
|
|
1328
|
+
max_n_tensors += 1; // duplicated output tensor
|
|
1329
|
+
max_n_tensors += n_layer*2; // duplicated rope freq tensors
|
|
1330
|
+
const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors;
|
|
1331
|
+
|
|
1332
|
+
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
|
1333
|
+
auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
|
|
1334
|
+
auto it = ctx_map.find(buft);
|
|
1335
|
+
if (it == ctx_map.end()) {
|
|
1336
|
+
ggml_init_params params = {
|
|
1337
|
+
/*.mem_size =*/ ctx_size,
|
|
1338
|
+
/*.mem_buffer =*/ NULL,
|
|
1339
|
+
/*.no_alloc =*/ true,
|
|
1340
|
+
};
|
|
1341
|
+
|
|
1342
|
+
ggml_context * ctx = ggml_init(params);
|
|
1343
|
+
if (!ctx) {
|
|
1344
|
+
throw std::runtime_error(format("failed to create ggml context"));
|
|
1345
|
+
}
|
|
1346
|
+
|
|
1347
|
+
ctx_map[buft] = ctx;
|
|
1348
|
+
pimpl->ctxs.emplace_back(ctx);
|
|
1349
|
+
|
|
1350
|
+
return ctx;
|
|
1351
|
+
}
|
|
1352
|
+
return it->second;
|
|
1353
|
+
};
|
|
1354
|
+
|
|
1355
|
+
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
|
|
1356
|
+
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
|
|
1357
|
+
|
|
1358
|
+
// create tensors for the weights
|
|
1359
|
+
{
|
|
1360
|
+
// note: cast to int64_t since we will use these for the tensor dimensions
|
|
1361
|
+
const int64_t n_head = hparams.n_head();
|
|
1362
|
+
const int64_t n_head_kv = hparams.n_head_kv();
|
|
1363
|
+
const int64_t n_embd = hparams.n_embd;
|
|
1364
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
1365
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
1366
|
+
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
1367
|
+
const int64_t n_embd_head_v = hparams.n_embd_head_v;
|
|
1368
|
+
const int64_t n_ff = hparams.n_ff();
|
|
1369
|
+
const int64_t n_embd_gqa = n_embd_v_gqa;
|
|
1370
|
+
const int64_t n_vocab = vocab.n_tokens();
|
|
1371
|
+
const int64_t n_token_types = vocab.n_token_types();
|
|
1372
|
+
const int64_t n_rot = hparams.n_rot;
|
|
1373
|
+
const int64_t n_expert = hparams.n_expert;
|
|
1374
|
+
const int64_t n_expert_used = hparams.n_expert_used;
|
|
1375
|
+
const int64_t n_ctx_train = hparams.n_ctx_train;
|
|
1376
|
+
|
|
1377
|
+
if (n_expert > 0 && hparams.n_expert_used == 0) {
|
|
1378
|
+
throw std::runtime_error("model has expert layers but no expert layers are used");
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1381
|
+
int n_moved_tensors = 0;
|
|
1382
|
+
ggml_tensor * first_moved_tensor = nullptr;
|
|
1383
|
+
ggml_backend_buffer_type_t first_moved_from_buft = nullptr;
|
|
1384
|
+
ggml_backend_buffer_type_t first_moved_to_buft = nullptr;
|
|
1385
|
+
|
|
1386
|
+
auto create_tensor = [&](const LLM_TN_IMPL & tn, const std::initializer_list<int64_t> & ne, int flags) -> ggml_tensor * {
|
|
1387
|
+
ggml_tensor * t_meta = ml.get_tensor_meta(tn.str().c_str());
|
|
1388
|
+
|
|
1389
|
+
if (!t_meta) {
|
|
1390
|
+
if (flags & TENSOR_NOT_REQUIRED) {
|
|
1391
|
+
return nullptr;
|
|
1392
|
+
}
|
|
1393
|
+
throw std::runtime_error(format("missing tensor '%s'", tn.str().c_str()));
|
|
1394
|
+
}
|
|
1395
|
+
|
|
1396
|
+
// some models use the token embedding tensor as the output, but since these are used in different layers and with different ops
|
|
1397
|
+
// the tensor is duplicated
|
|
1398
|
+
// to handle this, we check if the tensor is duplicated, and if so, we assume that it is being loaded as the output tensor
|
|
1399
|
+
llm_tensor tn_tensor = tn.tensor;
|
|
1400
|
+
if (tn.tensor == LLM_TENSOR_TOKEN_EMBD && flags & TENSOR_DUPLICATED) {
|
|
1401
|
+
tn_tensor = LLM_TENSOR_OUTPUT;
|
|
1402
|
+
}
|
|
1403
|
+
|
|
1404
|
+
llm_tensor_info info;
|
|
1405
|
+
try {
|
|
1406
|
+
info = llm_tensor_info_for(tn_tensor);
|
|
1407
|
+
} catch (const std::out_of_range & e) {
|
|
1408
|
+
throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
// tensors with "bias" suffix are always used with GGML_OP_ADD
|
|
1412
|
+
ggml_op op;
|
|
1413
|
+
bool bias = tn.suffix != nullptr && strcmp(tn.suffix, "bias") == 0;
|
|
1414
|
+
if (bias) {
|
|
1415
|
+
op = GGML_OP_ADD;
|
|
1416
|
+
} else {
|
|
1417
|
+
op = info.op;
|
|
1418
|
+
}
|
|
1419
|
+
|
|
1420
|
+
// sanity checks
|
|
1421
|
+
if (info.layer == LLM_TENSOR_LAYER_INPUT || info.layer == LLM_TENSOR_LAYER_OUTPUT) {
|
|
1422
|
+
if (tn.bid != -1) {
|
|
1423
|
+
GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());
|
|
1424
|
+
}
|
|
1425
|
+
} else {
|
|
1426
|
+
if (tn.bid == -1) {
|
|
1427
|
+
GGML_ABORT("repeating layer tensor %s used without a layer number", tn.str().c_str());
|
|
1428
|
+
}
|
|
1429
|
+
}
|
|
1430
|
+
|
|
1431
|
+
// select the buffer type for this tensor
|
|
1432
|
+
buft_list_t * buft_list;
|
|
1433
|
+
switch (info.layer) {
|
|
1434
|
+
case LLM_TENSOR_LAYER_INPUT:
|
|
1435
|
+
buft_list = pimpl->dev_input.buft_list;
|
|
1436
|
+
break;
|
|
1437
|
+
case LLM_TENSOR_LAYER_OUTPUT:
|
|
1438
|
+
buft_list = pimpl->dev_output.buft_list;
|
|
1439
|
+
break;
|
|
1440
|
+
case LLM_TENSOR_LAYER_REPEATING:
|
|
1441
|
+
buft_list = pimpl->dev_layer.at(tn.bid).buft_list;
|
|
1442
|
+
break;
|
|
1443
|
+
default:
|
|
1444
|
+
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
1445
|
+
}
|
|
1446
|
+
|
|
1447
|
+
ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
|
1448
|
+
if (!buft) {
|
|
1449
|
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
1450
|
+
}
|
|
1451
|
+
|
|
1452
|
+
// avoid using a host buffer when using mmap
|
|
1453
|
+
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
|
1454
|
+
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
|
1455
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1456
|
+
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
|
1457
|
+
}
|
|
1458
|
+
|
|
1459
|
+
if (buft != buft_list->front().second) {
|
|
1460
|
+
n_moved_tensors++;
|
|
1461
|
+
if (!first_moved_tensor) {
|
|
1462
|
+
first_moved_tensor = t_meta;
|
|
1463
|
+
first_moved_from_buft = buft_list->front().second;
|
|
1464
|
+
first_moved_to_buft = buft;
|
|
1465
|
+
}
|
|
1466
|
+
}
|
|
1467
|
+
|
|
1468
|
+
ggml_context * ctx = ctx_for_buft(buft);
|
|
1469
|
+
|
|
1470
|
+
// if duplicated, check if the original tensor was allocated in the same buffer type context and avoid creating a new one
|
|
1471
|
+
if (flags & TENSOR_DUPLICATED) {
|
|
1472
|
+
ggml_tensor * t = ggml_get_tensor(ctx, tn.str().c_str());
|
|
1473
|
+
if (t) {
|
|
1474
|
+
return t;
|
|
1475
|
+
}
|
|
1476
|
+
}
|
|
1477
|
+
return ml.create_tensor(ctx, tn, ne, flags);
|
|
1478
|
+
};
|
|
1479
|
+
|
|
1480
|
+
layers.resize(n_layer);
|
|
1481
|
+
|
|
1482
|
+
// TODO: move to a separate function
|
|
1483
|
+
const auto tn = LLM_TN(arch);
|
|
1484
|
+
switch (arch) {
|
|
1485
|
+
case LLM_ARCH_LLAMA:
|
|
1486
|
+
case LLM_ARCH_REFACT:
|
|
1487
|
+
case LLM_ARCH_MINICPM:
|
|
1488
|
+
case LLM_ARCH_GRANITE:
|
|
1489
|
+
case LLM_ARCH_GRANITE_MOE:
|
|
1490
|
+
{
|
|
1491
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1492
|
+
|
|
1493
|
+
// output
|
|
1494
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1495
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1496
|
+
|
|
1497
|
+
// if output is NULL, init from the input tok embed
|
|
1498
|
+
if (output == NULL) {
|
|
1499
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1500
|
+
}
|
|
1501
|
+
|
|
1502
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1503
|
+
auto & layer = layers[i];
|
|
1504
|
+
|
|
1505
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1506
|
+
|
|
1507
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
1508
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
1509
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
1510
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
1511
|
+
|
|
1512
|
+
// optional bias tensors
|
|
1513
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1514
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1515
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1516
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1517
|
+
|
|
1518
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1519
|
+
|
|
1520
|
+
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
|
1521
|
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1522
|
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1523
|
+
}
|
|
1524
|
+
else {
|
|
1525
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1526
|
+
}
|
|
1527
|
+
|
|
1528
|
+
if (n_expert == 0) {
|
|
1529
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1530
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1531
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1532
|
+
|
|
1533
|
+
// optional MLP bias
|
|
1534
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1535
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1536
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1537
|
+
} else {
|
|
1538
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1539
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
1540
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
1541
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1542
|
+
}
|
|
1543
|
+
}
|
|
1544
|
+
} break;
|
|
1545
|
+
case LLM_ARCH_DECI:
|
|
1546
|
+
{
|
|
1547
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1548
|
+
|
|
1549
|
+
// output
|
|
1550
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1551
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1552
|
+
|
|
1553
|
+
// if output is NULL, init from the input tok embed
|
|
1554
|
+
if (output == NULL) {
|
|
1555
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1556
|
+
}
|
|
1557
|
+
|
|
1558
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1559
|
+
auto & layer = layers[i];
|
|
1560
|
+
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i);
|
|
1561
|
+
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i);
|
|
1562
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i);
|
|
1563
|
+
const int64_t n_ff = hparams.n_ff(i);
|
|
1564
|
+
const int64_t n_head = hparams.n_head(i);
|
|
1565
|
+
const int64_t n_head_kv = hparams.n_head_kv(i);
|
|
1566
|
+
|
|
1567
|
+
if (n_head_kv == 0 && n_head > 0) {
|
|
1568
|
+
// linear attention for DeciLMCausalModel
|
|
1569
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1570
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1571
|
+
}
|
|
1572
|
+
else if (n_head_kv > 0) {
|
|
1573
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1574
|
+
|
|
1575
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
1576
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
1577
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
1578
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
1579
|
+
}
|
|
1580
|
+
|
|
1581
|
+
// optional bias tensors
|
|
1582
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1583
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1584
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1585
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1586
|
+
|
|
1587
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1588
|
+
|
|
1589
|
+
if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
|
|
1590
|
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1591
|
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1592
|
+
}
|
|
1593
|
+
else {
|
|
1594
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1595
|
+
}
|
|
1596
|
+
|
|
1597
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1598
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1599
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1600
|
+
|
|
1601
|
+
// optional MLP bias
|
|
1602
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1603
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1604
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1605
|
+
}
|
|
1606
|
+
} break;
|
|
1607
|
+
case LLM_ARCH_MINICPM3:
|
|
1608
|
+
{
|
|
1609
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
1610
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
1611
|
+
|
|
1612
|
+
const int64_t q_lora_rank = hparams.n_lora_q;
|
|
1613
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
1614
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1615
|
+
|
|
1616
|
+
// output
|
|
1617
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1618
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1619
|
+
|
|
1620
|
+
// if output is NULL, init from the input tok embed
|
|
1621
|
+
if (output == NULL) {
|
|
1622
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1623
|
+
}
|
|
1624
|
+
|
|
1625
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1626
|
+
auto & layer = layers[i];
|
|
1627
|
+
|
|
1628
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1629
|
+
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
|
|
1630
|
+
|
|
1631
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
|
1632
|
+
|
|
1633
|
+
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
|
1634
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
|
|
1635
|
+
|
|
1636
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
|
1637
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
|
1638
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
|
1639
|
+
|
|
1640
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1641
|
+
|
|
1642
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1643
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1644
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1645
|
+
|
|
1646
|
+
layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1647
|
+
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head_qk_rope/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1648
|
+
}
|
|
1649
|
+
} break;
|
|
1650
|
+
case LLM_ARCH_GROK:
|
|
1651
|
+
{
|
|
1652
|
+
if (n_expert == 0) {
|
|
1653
|
+
throw std::runtime_error("Grok model cannot have zero experts");
|
|
1654
|
+
}
|
|
1655
|
+
|
|
1656
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1657
|
+
|
|
1658
|
+
// output
|
|
1659
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1660
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1661
|
+
|
|
1662
|
+
// if output is NULL, init from the input tok embed
|
|
1663
|
+
if (output == NULL) {
|
|
1664
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1665
|
+
}
|
|
1666
|
+
|
|
1667
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1668
|
+
auto & layer = layers[i];
|
|
1669
|
+
|
|
1670
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1671
|
+
|
|
1672
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
1673
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1674
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1675
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1676
|
+
|
|
1677
|
+
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1678
|
+
|
|
1679
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1680
|
+
|
|
1681
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1682
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, TENSOR_NOT_REQUIRED);
|
|
1683
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
1684
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1685
|
+
|
|
1686
|
+
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1687
|
+
}
|
|
1688
|
+
} break;
|
|
1689
|
+
case LLM_ARCH_DBRX:
|
|
1690
|
+
{
|
|
1691
|
+
if (n_expert == 0) {
|
|
1692
|
+
throw std::runtime_error("DBRX model cannot have zero experts");
|
|
1693
|
+
}
|
|
1694
|
+
|
|
1695
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1696
|
+
|
|
1697
|
+
// output
|
|
1698
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1699
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
1700
|
+
|
|
1701
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1702
|
+
auto & layer = layers[i];
|
|
1703
|
+
|
|
1704
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1705
|
+
|
|
1706
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1707
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1708
|
+
|
|
1709
|
+
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1710
|
+
|
|
1711
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1712
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1713
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
|
|
1714
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
1715
|
+
}
|
|
1716
|
+
} break;
|
|
1717
|
+
case LLM_ARCH_BAICHUAN:
|
|
1718
|
+
{
|
|
1719
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1720
|
+
{
|
|
1721
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1722
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
1723
|
+
}
|
|
1724
|
+
|
|
1725
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1726
|
+
auto & layer = layers[i];
|
|
1727
|
+
|
|
1728
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1729
|
+
|
|
1730
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
1731
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1732
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1733
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1734
|
+
|
|
1735
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1736
|
+
|
|
1737
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1738
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1739
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1740
|
+
}
|
|
1741
|
+
} break;
|
|
1742
|
+
case LLM_ARCH_FALCON:
|
|
1743
|
+
{
|
|
1744
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1745
|
+
|
|
1746
|
+
// output
|
|
1747
|
+
{
|
|
1748
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1749
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
1750
|
+
|
|
1751
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1752
|
+
if (!output) {
|
|
1753
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
|
|
1754
|
+
}
|
|
1755
|
+
}
|
|
1756
|
+
|
|
1757
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1758
|
+
auto & layer = layers[i];
|
|
1759
|
+
|
|
1760
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1761
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
1762
|
+
|
|
1763
|
+
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1764
|
+
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1765
|
+
|
|
1766
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1767
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1768
|
+
|
|
1769
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1770
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1771
|
+
}
|
|
1772
|
+
} break;
|
|
1773
|
+
case LLM_ARCH_STARCODER:
|
|
1774
|
+
{
|
|
1775
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1776
|
+
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
|
|
1777
|
+
|
|
1778
|
+
// output
|
|
1779
|
+
{
|
|
1780
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1781
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
1782
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1783
|
+
if (!output) {
|
|
1784
|
+
// needs to be on GPU
|
|
1785
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1786
|
+
}
|
|
1787
|
+
|
|
1788
|
+
}
|
|
1789
|
+
|
|
1790
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1791
|
+
auto & layer = layers[i];
|
|
1792
|
+
|
|
1793
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1794
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
1795
|
+
|
|
1796
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1797
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
1798
|
+
|
|
1799
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1800
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1801
|
+
|
|
1802
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1803
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
|
1804
|
+
|
|
1805
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1806
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
1807
|
+
|
|
1808
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1809
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
1810
|
+
}
|
|
1811
|
+
} break;
|
|
1812
|
+
case LLM_ARCH_BERT:
|
|
1813
|
+
case LLM_ARCH_NOMIC_BERT:
|
|
1814
|
+
{
|
|
1815
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1816
|
+
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
|
1817
|
+
|
|
1818
|
+
if (arch == LLM_ARCH_BERT) {
|
|
1819
|
+
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
|
|
1820
|
+
|
|
1821
|
+
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, n_embd}, TENSOR_NOT_REQUIRED);
|
|
1822
|
+
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1823
|
+
|
|
1824
|
+
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
|
|
1825
|
+
cls_out_b = create_tensor(tn(LLM_TENSOR_CLS_OUT, "bias"), {1}, TENSOR_NOT_REQUIRED);
|
|
1826
|
+
}
|
|
1827
|
+
|
|
1828
|
+
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
1829
|
+
tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
|
|
1830
|
+
|
|
1831
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1832
|
+
auto & layer = layers[i];
|
|
1833
|
+
|
|
1834
|
+
if (arch == LLM_ARCH_BERT) {
|
|
1835
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
1836
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
1837
|
+
|
|
1838
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1839
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
|
1840
|
+
|
|
1841
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1842
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
|
1843
|
+
} else {
|
|
1844
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1848
|
+
|
|
1849
|
+
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1850
|
+
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1851
|
+
|
|
1852
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1853
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1854
|
+
|
|
1855
|
+
if (arch == LLM_ARCH_BERT) {
|
|
1856
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1857
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
1858
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
1859
|
+
} else {
|
|
1860
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1861
|
+
}
|
|
1862
|
+
|
|
1863
|
+
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1864
|
+
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1865
|
+
}
|
|
1866
|
+
} break;
|
|
1867
|
+
case LLM_ARCH_JINA_BERT_V2:
|
|
1868
|
+
{
|
|
1869
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // word_embeddings
|
|
1870
|
+
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); // token_type_embeddings
|
|
1871
|
+
|
|
1872
|
+
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); // LayerNorm
|
|
1873
|
+
tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0); //LayerNorm bias
|
|
1874
|
+
|
|
1875
|
+
cls = create_tensor(tn(LLM_TENSOR_CLS, "weight"), {n_embd, 1}, TENSOR_NOT_REQUIRED);
|
|
1876
|
+
cls_b = create_tensor(tn(LLM_TENSOR_CLS, "bias"), {1}, TENSOR_NOT_REQUIRED);
|
|
1877
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1878
|
+
auto & layer = layers[i]; // JinaBertLayer
|
|
1879
|
+
|
|
1880
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
1881
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
1882
|
+
|
|
1883
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1884
|
+
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1885
|
+
|
|
1886
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1887
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
|
1888
|
+
|
|
1889
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1890
|
+
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1891
|
+
|
|
1892
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
1893
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
|
1894
|
+
|
|
1895
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); //output_dens
|
|
1896
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); //output_dens
|
|
1897
|
+
|
|
1898
|
+
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); //output_norm
|
|
1899
|
+
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1900
|
+
|
|
1901
|
+
layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1902
|
+
layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1903
|
+
|
|
1904
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1905
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1906
|
+
|
|
1907
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1908
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
1909
|
+
|
|
1910
|
+
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1911
|
+
layer.layer_out_norm_b = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1912
|
+
}
|
|
1913
|
+
} break;
|
|
1914
|
+
case LLM_ARCH_BLOOM:
|
|
1915
|
+
{
|
|
1916
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1917
|
+
tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
|
|
1918
|
+
tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
|
|
1919
|
+
|
|
1920
|
+
// output
|
|
1921
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1922
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
1923
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
1924
|
+
|
|
1925
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1926
|
+
auto & layer = layers[i];
|
|
1927
|
+
|
|
1928
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1929
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
1930
|
+
|
|
1931
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1932
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
1933
|
+
|
|
1934
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1935
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1936
|
+
|
|
1937
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1938
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
|
1939
|
+
|
|
1940
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1941
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
1942
|
+
|
|
1943
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1944
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
1945
|
+
}
|
|
1946
|
+
} break;
|
|
1947
|
+
case LLM_ARCH_MPT:
|
|
1948
|
+
{
|
|
1949
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1950
|
+
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, TENSOR_NOT_REQUIRED);
|
|
1951
|
+
|
|
1952
|
+
// output
|
|
1953
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1954
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1955
|
+
|
|
1956
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1957
|
+
if (!output) {
|
|
1958
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // needs to be on GPU
|
|
1959
|
+
}
|
|
1960
|
+
|
|
1961
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1962
|
+
auto & layer = layers[i];
|
|
1963
|
+
|
|
1964
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1965
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1966
|
+
|
|
1967
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1968
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
1969
|
+
|
|
1970
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1971
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1972
|
+
|
|
1973
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1974
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1975
|
+
|
|
1976
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1977
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1978
|
+
|
|
1979
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1980
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1981
|
+
|
|
1982
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1983
|
+
layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1984
|
+
|
|
1985
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1986
|
+
layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
1987
|
+
|
|
1988
|
+
// AWQ ScaleActivation layer
|
|
1989
|
+
layer.ffn_act = create_tensor(tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, TENSOR_NOT_REQUIRED);
|
|
1990
|
+
}
|
|
1991
|
+
} break;
|
|
1992
|
+
case LLM_ARCH_STABLELM:
|
|
1993
|
+
{
|
|
1994
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1995
|
+
|
|
1996
|
+
// output
|
|
1997
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
1998
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1999
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2000
|
+
|
|
2001
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2002
|
+
auto & layer = layers[i];
|
|
2003
|
+
|
|
2004
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2005
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
2006
|
+
|
|
2007
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2008
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2009
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2010
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2011
|
+
|
|
2012
|
+
// optional bias tensors, present in Stable LM 2 1.6B
|
|
2013
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2014
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2015
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2016
|
+
|
|
2017
|
+
// optional q and k layernorms, present in StableLM 2 12B
|
|
2018
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
|
|
2019
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
|
|
2020
|
+
|
|
2021
|
+
// optional FFN norm, not present in StableLM 2 12B which uses parallel residual
|
|
2022
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2023
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2024
|
+
|
|
2025
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2026
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2027
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2028
|
+
}
|
|
2029
|
+
} break;
|
|
2030
|
+
case LLM_ARCH_QWEN:
|
|
2031
|
+
{
|
|
2032
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2033
|
+
|
|
2034
|
+
// output
|
|
2035
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2036
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2037
|
+
|
|
2038
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2039
|
+
auto & layer = layers[i];
|
|
2040
|
+
|
|
2041
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2042
|
+
|
|
2043
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}, 0);
|
|
2044
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}, 0);
|
|
2045
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2046
|
+
|
|
2047
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2048
|
+
|
|
2049
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}, 0);
|
|
2050
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}, 0);
|
|
2051
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}, 0);
|
|
2052
|
+
}
|
|
2053
|
+
} break;
|
|
2054
|
+
case LLM_ARCH_QWEN2:
|
|
2055
|
+
case LLM_ARCH_QWEN2VL:
|
|
2056
|
+
{
|
|
2057
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2058
|
+
|
|
2059
|
+
// output
|
|
2060
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2061
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2062
|
+
// if output is NULL, init from the input tok embed
|
|
2063
|
+
if (output == NULL) {
|
|
2064
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2065
|
+
}
|
|
2066
|
+
|
|
2067
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2068
|
+
auto & layer = layers[i];
|
|
2069
|
+
|
|
2070
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2071
|
+
|
|
2072
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2073
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2074
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2075
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2076
|
+
|
|
2077
|
+
// optional bias tensors
|
|
2078
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
2079
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
|
2080
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
|
2081
|
+
|
|
2082
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2083
|
+
|
|
2084
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2085
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2086
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2087
|
+
}
|
|
2088
|
+
} break;
|
|
2089
|
+
case LLM_ARCH_QWEN2MOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

// optional bias tensors
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);

layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);

if (n_expert == 0) {
throw std::runtime_error("n_expert must be > 0 for QWEN2MOE");
}
if (n_expert_used == 0) {
throw std::runtime_error("n_expert_used must be > 0 for QWEN2MOE");
}

// MoE branch
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;

layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);

// Shared expert branch
const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff;

layer.ffn_gate_inp_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}, 0);
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
}
} break;
case LLM_ARCH_PHI2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);

if (layer.wqkv == nullptr) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);

layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);

layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
}

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
}
} break;
case LLM_ARCH_PHI3:
{
const int64_t n_embd_head = n_embd / n_head;

tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);

layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0);

layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }, 0);

layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_embd_head/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
}
} break;
case LLM_ARCH_PLAMO:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_GPT2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
}
} break;
case LLM_ARCH_CODESHELL:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);

layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
}
} break;
case LLM_ARCH_ORION:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);

layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_INTERNLM2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
// layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);

layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_GEMMA:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
}
} break;
case LLM_ARCH_GEMMA2:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); // same as tok_embd, duplicated to allow offloading

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);

layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_STARCODER2:
|
|
2394
|
+
{
|
|
2395
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2396
|
+
|
|
2397
|
+
// output
|
|
2398
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2399
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
2400
|
+
|
|
2401
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2402
|
+
// if output is NULL, init from the input tok embed
|
|
2403
|
+
if (output == NULL) {
|
|
2404
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2405
|
+
}
|
|
2406
|
+
|
|
2407
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2408
|
+
auto & layer = layers[i];
|
|
2409
|
+
|
|
2410
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2411
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
2412
|
+
|
|
2413
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2414
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2415
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2416
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2417
|
+
|
|
2418
|
+
// optional bias tensors
|
|
2419
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
2420
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, 0);
|
|
2421
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, 0);
|
|
2422
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2423
|
+
|
|
2424
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2425
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
|
2426
|
+
|
|
2427
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2428
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2429
|
+
|
|
2430
|
+
// optional bias tensors
|
|
2431
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2432
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, 0);
|
|
2433
|
+
}
|
|
2434
|
+
} break;
|
|
2435
|
+
case LLM_ARCH_MAMBA:
|
|
2436
|
+
{
|
|
2437
|
+
const int64_t d_conv = hparams.ssm_d_conv;
|
|
2438
|
+
const int64_t d_inner = hparams.ssm_d_inner;
|
|
2439
|
+
const int64_t d_state = hparams.ssm_d_state;
|
|
2440
|
+
const int64_t dt_rank = hparams.ssm_dt_rank;
|
|
2441
|
+
|
|
2442
|
+
// only an expansion factor of 2 is supported for now
|
|
2443
|
+
if (2 * n_embd != d_inner) {
|
|
2444
|
+
throw std::runtime_error("only an expansion factor of 2 is supported for now");
|
|
2445
|
+
}
|
|
2446
|
+
|
|
2447
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2448
|
+
|
|
2449
|
+
// output
|
|
2450
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2451
|
+
|
|
2452
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2453
|
+
// if output is NULL, init from the input tok embed, duplicated to allow offloading
|
|
2454
|
+
if (output == NULL) {
|
|
2455
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2456
|
+
}
|
|
2457
|
+
|
|
2458
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2459
|
+
auto & layer = layers[i];
|
|
2460
|
+
|
|
2461
|
+
// norm
|
|
2462
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2463
|
+
|
|
2464
|
+
layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}, 0);
|
|
2465
|
+
|
|
2466
|
+
layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}, 0);
|
|
2467
|
+
layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}, 0);
|
|
2468
|
+
|
|
2469
|
+
layer.ssm_x = create_tensor(tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}, 0);
|
|
2470
|
+
|
|
2471
|
+
layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}, 0);
|
|
2472
|
+
layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}, 0);
|
|
2473
|
+
|
|
2474
|
+
// no "weight" suffix for these
|
|
2475
|
+
layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}, 0);
|
|
2476
|
+
layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {d_inner}, 0);
|
|
2477
|
+
|
|
2478
|
+
// out_proj
|
|
2479
|
+
layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}, 0);
|
|
2480
|
+
}
|
|
2481
|
+
} break;
|
|
2482
|
+
case LLM_ARCH_XVERSE:
|
|
2483
|
+
{
|
|
2484
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2485
|
+
|
|
2486
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2487
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2488
|
+
|
|
2489
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2490
|
+
auto & layer = layers[i];
|
|
2491
|
+
|
|
2492
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2493
|
+
|
|
2494
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2495
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2496
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2497
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2498
|
+
|
|
2499
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2500
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2501
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2502
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2503
|
+
}
|
|
2504
|
+
} break;
|
|
2505
|
+
case LLM_ARCH_COMMAND_R:
|
|
2506
|
+
{
|
|
2507
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2508
|
+
|
|
2509
|
+
// output
|
|
2510
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2511
|
+
// init output from the input tok embed
|
|
2512
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2513
|
+
|
|
2514
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2515
|
+
auto & layer = layers[i];
|
|
2516
|
+
|
|
2517
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2518
|
+
|
|
2519
|
+
if (n_layer >= 64){
|
|
2520
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
|
|
2521
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
|
|
2522
|
+
}
|
|
2523
|
+
|
|
2524
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2525
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2526
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2527
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2528
|
+
|
|
2529
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2530
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2531
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2532
|
+
}
|
|
2533
|
+
} break;
|
|
2534
|
+
case LLM_ARCH_COHERE2:
|
|
2535
|
+
{
|
|
2536
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
|
|
2537
|
+
|
|
2538
|
+
// output
|
|
2539
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
|
2540
|
+
// init output from the input tok embed
|
|
2541
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab },
|
|
2542
|
+
TENSOR_DUPLICATED);
|
|
2543
|
+
|
|
2544
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2545
|
+
auto & layer = layers[i];
|
|
2546
|
+
|
|
2547
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
|
2548
|
+
|
|
2549
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd }, 0);
|
|
2550
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_gqa }, 0);
|
|
2551
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_gqa }, 0);
|
|
2552
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
|
|
2553
|
+
|
|
2554
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
|
|
2555
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0);
|
|
2556
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0);
|
|
2557
|
+
}
|
|
2558
|
+
}
|
|
2559
|
+
break;
|
|
2560
|
+
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
|
2561
|
+
{
|
|
2562
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2563
|
+
|
|
2564
|
+
// output
|
|
2565
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2566
|
+
// if output is NULL, init from the input tok embed
|
|
2567
|
+
if (output == NULL) {
|
|
2568
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2569
|
+
}
|
|
2570
|
+
|
|
2571
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2572
|
+
auto & layer = layers[i];
|
|
2573
|
+
|
|
2574
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2575
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2576
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2577
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2578
|
+
|
|
2579
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2580
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2581
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2582
|
+
}
|
|
2583
|
+
} break;
|
|
2584
|
+
case LLM_ARCH_OLMO2:
|
|
2585
|
+
{
|
|
2586
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2587
|
+
|
|
2588
|
+
// output
|
|
2589
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2590
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2591
|
+
|
|
2592
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2593
|
+
auto & layer = layers[i];
|
|
2594
|
+
|
|
2595
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2596
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2597
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2598
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2599
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
|
|
2600
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
|
|
2601
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
2602
|
+
|
|
2603
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2604
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2605
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2606
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
2607
|
+
}
|
|
2608
|
+
} break;
|
|
2609
|
+
case LLM_ARCH_OLMOE:
|
|
2610
|
+
{
|
|
2611
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2612
|
+
|
|
2613
|
+
// output
|
|
2614
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2615
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2616
|
+
|
|
2617
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2618
|
+
auto & layer = layers[i];
|
|
2619
|
+
|
|
2620
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2621
|
+
|
|
2622
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2623
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2624
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2625
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2626
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
|
|
2627
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
|
|
2628
|
+
|
|
2629
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2630
|
+
|
|
2631
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2632
|
+
|
|
2633
|
+
if (n_expert == 0) {
|
|
2634
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
2635
|
+
}
|
|
2636
|
+
if (n_expert_used == 0) {
|
|
2637
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
2638
|
+
}
|
|
2639
|
+
|
|
2640
|
+
// MoE branch
|
|
2641
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
2642
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0);
|
|
2643
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
2644
|
+
}
|
|
2645
|
+
} break;
|
|
2646
|
+
case LLM_ARCH_OPENELM:
|
|
2647
|
+
{
|
|
2648
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2649
|
+
|
|
2650
|
+
// output
|
|
2651
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2652
|
+
// init output from the input tok embed
|
|
2653
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2654
|
+
|
|
2655
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2656
|
+
const int64_t n_head = hparams.n_head(i);
|
|
2657
|
+
const int64_t n_head_qkv = 2*hparams.n_head_kv(i) + n_head;
|
|
2658
|
+
const int64_t n_ff = hparams.n_ff(i);
|
|
2659
|
+
|
|
2660
|
+
auto & layer = layers[i];
|
|
2661
|
+
|
|
2662
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2663
|
+
|
|
2664
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_head_qkv*n_embd_head_k}, 0);
|
|
2665
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2666
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2667
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head*n_embd_head_k, n_embd}, 0);
|
|
2668
|
+
|
|
2669
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2670
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2671
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2672
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2673
|
+
}
|
|
2674
|
+
} break;
|
|
2675
|
+
case LLM_ARCH_GPTNEOX:
|
|
2676
|
+
{
|
|
2677
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2678
|
+
|
|
2679
|
+
// output
|
|
2680
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2681
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
2682
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2683
|
+
|
|
2684
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2685
|
+
auto & layer = layers[i];
|
|
2686
|
+
|
|
2687
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2688
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
2689
|
+
|
|
2690
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2691
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2692
|
+
|
|
2693
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2694
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2695
|
+
|
|
2696
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2697
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
|
2698
|
+
|
|
2699
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2700
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2701
|
+
|
|
2702
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2703
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
2704
|
+
}
|
|
2705
|
+
} break;
|
|
2706
|
+
case LLM_ARCH_ARCTIC:
|
|
2707
|
+
{
|
|
2708
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2709
|
+
|
|
2710
|
+
// output
|
|
2711
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2712
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2713
|
+
|
|
2714
|
+
// if output is NULL, init from the input tok embed
|
|
2715
|
+
if (output == NULL) {
|
|
2716
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2717
|
+
}
|
|
2718
|
+
|
|
2719
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2720
|
+
auto & layer = layers[i];
|
|
2721
|
+
|
|
2722
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2723
|
+
|
|
2724
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2725
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2726
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2727
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2728
|
+
|
|
2729
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2730
|
+
|
|
2731
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd}, 0);
|
|
2732
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd}, 0);
|
|
2733
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd}, 0);
|
|
2734
|
+
|
|
2735
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2736
|
+
layer.ffn_norm_exps = create_tensor(tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd}, 0);
|
|
2737
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
|
|
2738
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
2739
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
|
|
2740
|
+
}
|
|
2741
|
+
} break;
|
|
2742
|
+
case LLM_ARCH_DEEPSEEK:
|
|
2743
|
+
{
|
|
2744
|
+
|
|
2745
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
2746
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
2747
|
+
|
|
2748
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2749
|
+
|
|
2750
|
+
// output
|
|
2751
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2752
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2753
|
+
|
|
2754
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2755
|
+
auto & layer = layers[i];
|
|
2756
|
+
|
|
2757
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2758
|
+
|
|
2759
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2760
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2761
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2762
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2763
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2764
|
+
|
|
2765
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
|
2766
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2767
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2768
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2769
|
+
} else {
|
|
2770
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2771
|
+
|
|
2772
|
+
if (n_expert == 0) {
|
|
2773
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
2774
|
+
}
|
|
2775
|
+
if (n_expert_used == 0) {
|
|
2776
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
2777
|
+
}
|
|
2778
|
+
|
|
2779
|
+
// MoE branch
|
|
2780
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2781
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
2782
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2783
|
+
|
|
2784
|
+
// Shared expert branch
|
|
2785
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
2786
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
2787
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
2788
|
+
}
|
|
2789
|
+
}
|
|
2790
|
+
} break;
|
|
2791
|
+
case LLM_ARCH_DEEPSEEK2:
|
|
2792
|
+
{
|
|
2793
|
+
const bool is_lite = (hparams.n_layer == 27);
|
|
2794
|
+
|
|
2795
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
2796
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
2797
|
+
|
|
2798
|
+
const int64_t q_lora_rank = hparams.n_lora_q;
|
|
2799
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
2800
|
+
|
|
2801
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
2802
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
2803
|
+
|
|
2804
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2805
|
+
|
|
2806
|
+
// output
|
|
2807
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2808
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2809
|
+
|
|
2810
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2811
|
+
auto & layer = layers[i];
|
|
2812
|
+
|
|
2813
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2814
|
+
if (!is_lite) {
|
|
2815
|
+
layer.attn_q_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM, "weight", i), {q_lora_rank}, 0);
|
|
2816
|
+
}
|
|
2817
|
+
|
|
2818
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
|
2819
|
+
|
|
2820
|
+
if (!is_lite) {
|
|
2821
|
+
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
|
2822
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
|
|
2823
|
+
} else {
|
|
2824
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2825
|
+
}
|
|
2826
|
+
|
|
2827
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
|
2828
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
|
2829
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
|
2830
|
+
|
|
2831
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2832
|
+
|
|
2833
|
+
if (i < (int) hparams.n_layer_dense_lead) {
|
|
2834
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2835
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2836
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2837
|
+
} else {
|
|
2838
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2839
|
+
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED);
|
|
2840
|
+
|
|
2841
|
+
if (n_expert == 0) {
|
|
2842
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
2843
|
+
}
|
|
2844
|
+
if (n_expert_used == 0) {
|
|
2845
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
2846
|
+
}
|
|
2847
|
+
|
|
2848
|
+
// MoE branch
|
|
2849
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2850
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
2851
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2852
|
+
|
|
2853
|
+
// Shared expert branch
|
|
2854
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
2855
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
2856
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
2857
|
+
}
|
|
2858
|
+
}
|
|
2859
|
+
} break;
|
|
2860
|
+
case LLM_ARCH_BITNET:
|
|
2861
|
+
{
|
|
2862
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2863
|
+
|
|
2864
|
+
// output
|
|
2865
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2866
|
+
|
|
2867
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2868
|
+
auto & layer = layers[i];
|
|
2869
|
+
|
|
2870
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2871
|
+
layer.attn_sub_norm = create_tensor(tn(LLM_TENSOR_ATTN_SUB_NORM, "weight", i), {n_embd}, 0);
|
|
2872
|
+
|
|
2873
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2874
|
+
layer.wq_scale = create_tensor(tn(LLM_TENSOR_ATTN_Q, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2875
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2876
|
+
layer.wk_scale = create_tensor(tn(LLM_TENSOR_ATTN_K, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2877
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2878
|
+
layer.wv_scale = create_tensor(tn(LLM_TENSOR_ATTN_V, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2879
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2880
|
+
layer.wo_scale = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2881
|
+
|
|
2882
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2883
|
+
layer.ffn_sub_norm = create_tensor(tn(LLM_TENSOR_FFN_SUB_NORM, "weight", i), {n_ff}, 0);
|
|
2884
|
+
|
|
2885
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2886
|
+
layer.ffn_gate_scale = create_tensor(tn(LLM_TENSOR_FFN_GATE, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2887
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2888
|
+
layer.ffn_down_scale = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2889
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2890
|
+
layer.ffn_up_scale = create_tensor(tn(LLM_TENSOR_FFN_UP, "scale", i), {1}, TENSOR_NOT_REQUIRED);
|
|
2891
|
+
}
|
|
2892
|
+
} break;
|
|
2893
|
+
case LLM_ARCH_T5:
|
|
2894
|
+
{
|
|
2895
|
+
const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
|
|
2896
|
+
|
|
2897
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2898
|
+
|
|
2899
|
+
// output
|
|
2900
|
+
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2901
|
+
output_norm = create_tensor(tn(LLM_TENSOR_DEC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2902
|
+
|
|
2903
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2904
|
+
// if output is NULL, init from the input tok embed
|
|
2905
|
+
if (output == NULL) {
|
|
2906
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2907
|
+
}
|
|
2908
|
+
|
|
2909
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2910
|
+
auto & layer = layers[i];
|
|
2911
|
+
|
|
2912
|
+
layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2913
|
+
layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
|
2914
|
+
|
|
2915
|
+
layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2916
|
+
layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2917
|
+
layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
2918
|
+
layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
|
|
2919
|
+
|
|
2920
|
+
layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2921
|
+
layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2922
|
+
layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2923
|
+
layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2924
|
+
|
|
2925
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_DEC_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2926
|
+
layer.attn_rel_b = create_tensor(tn(LLM_TENSOR_DEC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
|
2927
|
+
|
|
2928
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_DEC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2929
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_DEC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2930
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_DEC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
2931
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_DEC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
|
|
2932
|
+
|
|
2933
|
+
layer.attn_norm_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2934
|
+
// this tensor seems to be unused in HF transformers implementation
|
|
2935
|
+
layer.attn_rel_b_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
|
2936
|
+
|
|
2937
|
+
layer.wq_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2938
|
+
layer.wk_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2939
|
+
layer.wv_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
2940
|
+
layer.wo_cross = create_tensor(tn(LLM_TENSOR_DEC_CROSS_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
|
|
2941
|
+
|
|
2942
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_DEC_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2943
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_DEC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2944
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_DEC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2945
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_DEC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2946
|
+
}
|
|
2947
|
+
} break;
|
|
2948
|
+
case LLM_ARCH_T5ENCODER:
|
|
2949
|
+
{
|
|
2950
|
+
const auto n_rel_attn_bkts = hparams.n_rel_attn_bkts;
|
|
2951
|
+
|
|
2952
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2953
|
+
|
|
2954
|
+
// output
|
|
2955
|
+
output_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2956
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2957
|
+
// if output is NULL, init from the input tok embed
|
|
2958
|
+
if (output == NULL) {
|
|
2959
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2960
|
+
}
|
|
2961
|
+
|
|
2962
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2963
|
+
auto & layer = layers[i];
|
|
2964
|
+
|
|
2965
|
+
layer.attn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2966
|
+
layer.attn_rel_b_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_REL_B, "weight", i), {n_head, n_rel_attn_bkts}, TENSOR_NOT_REQUIRED);
|
|
2967
|
+
|
|
2968
|
+
layer.wq_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2969
|
+
layer.wk_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
2970
|
+
layer.wv_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
2971
|
+
layer.wo_enc = create_tensor(tn(LLM_TENSOR_ENC_ATTN_OUT, "weight", i), {n_embd_v_gqa, n_embd}, 0);
|
|
2972
|
+
|
|
2973
|
+
layer.ffn_norm_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2974
|
+
layer.ffn_gate_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED);
|
|
2975
|
+
layer.ffn_down_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2976
|
+
layer.ffn_up_enc = create_tensor(tn(LLM_TENSOR_ENC_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2977
|
+
}
|
|
2978
|
+
} break;
|
|
2979
|
+
case LLM_ARCH_JAIS:
|
|
2980
|
+
{
|
|
2981
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2982
|
+
|
|
2983
|
+
// output
|
|
2984
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2985
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
2986
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2987
|
+
|
|
2988
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2989
|
+
auto & layer = layers[i];
|
|
2990
|
+
|
|
2991
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2992
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
2993
|
+
|
|
2994
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
2995
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2996
|
+
|
|
2997
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2998
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2999
|
+
|
|
3000
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3001
|
+
layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
|
|
3002
|
+
|
|
3003
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3004
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
3005
|
+
|
|
3006
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
3007
|
+
layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, 0);
|
|
3008
|
+
|
|
3009
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3010
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
3011
|
+
}
|
|
3012
|
+
} break;
|
|
3013
|
+
+case LLM_ARCH_CHATGLM:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        // output
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
+            layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
+
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
+
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        }
+    } break;
+case LLM_ARCH_NEMOTRON:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        // output
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+            // optional bias tensors
+            layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+            layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+            layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+            layer.ffn_norm_b = create_tensor(tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, 0);
+
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+
+            // optional MLP bias
+            layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
+        }
+    } break;
+case LLM_ARCH_EXAONE:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        // output
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
+            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
+            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+        }
+    } break;
+case LLM_ARCH_RWKV6:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        // Block 0, LN0
+        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0);
+        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, 0);
+
+        // output
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+        const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+        const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+        const int head_size = hparams.wkv_head_size;
+        const int attn_hidden_size = n_embd;
+        const int ffn_size = hparams.n_ff_arr[0];
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
+
+            layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+            layer.attn_norm_2_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, 0);
+
+            layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+            layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+            layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+            layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
+
+            layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
+            layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+            layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+            layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+            layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {attn_hidden_size, n_embd}, 0);
+            layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
+            layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+            layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+
+            layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, 0);
+            layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, 0);
+            layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+            layer.channel_mix_lerp_k = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, 0);
+            layer.channel_mix_lerp_r = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, 0);
+
+            layer.channel_mix_key = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_KEY, "weight", i), {n_embd, ffn_size}, 0);
+            layer.channel_mix_value = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_VALUE, "weight", i), {ffn_size, n_embd}, 0);
+            layer.channel_mix_receptance = create_tensor(tn(LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "weight", i), {n_embd, n_embd}, 0);
+        }
+
+    } break;
+case LLM_ARCH_RWKV6QWEN2:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+        const int time_mix_extra_dim = hparams.time_mix_extra_dim;
+        const int time_decay_extra_dim = hparams.time_decay_extra_dim;
+        const int head_size = hparams.wkv_head_size;
+        const int attn_hidden_size = n_embd;
+        const int n_head_kv = hparams.n_head_kv();
+        int attn_key_value_size;
+        if (n_head_kv == 0 || attn_hidden_size / head_size == n_head_kv) {
+            attn_key_value_size = attn_hidden_size;
+        } else {
+            attn_key_value_size = n_head_kv * head_size;
+        }
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.time_mix_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W1, "weight", i), {n_embd, time_mix_extra_dim * 5}, 0);
+            layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
+
+            layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
+            layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
+
+            layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
+            layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
+            layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
+            layer.time_mix_key = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "weight", i), {n_embd, attn_key_value_size}, 0);
+            layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {n_embd, attn_key_value_size}, 0);
+            layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
+            layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
+            // optional bias tensors
+            layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+            layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED);
+
+            layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);
+
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+        }
+    } break;
+case LLM_ARCH_CHAMELEON:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+        // output
+        output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+        // if output is NULL, init from the input tok embed
+        if (output == NULL) {
+            output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+        }
+
+        for (int i = 0; i < n_layer; ++i) {
+            auto & layer = layers[i];
+
+            layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+            layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}, 0);
+            layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}, 0);
+            layer.attn_q_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd_head_k, n_head}, TENSOR_NOT_REQUIRED);
+            layer.attn_k_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd_head_k, n_head_kv}, TENSOR_NOT_REQUIRED);
+
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+            layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+
+            layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+        }
+    } break;
+case LLM_ARCH_WAVTOKENIZER_DEC:
+    {
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
+
+        conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
+        conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
+
+        // posnet
+        {
+            const int64_t n_embd = hparams.posnet.n_embd;
+
+            for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+                auto & layer = layers[i].posnet;
+
+                // posnet:
+                //
+                // - resnet
+                // - resnet
+                // - attn
+                // - resnet
+                // - resnet
+                // - norm
+                //
+                switch (i) {
+                    case 0:
+                    case 1:
+                    case 3:
+                    case 4:
+                        {
+                            layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                            layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
+
+                            layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                            layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
+
+                            layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                            layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
+
+                            layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                            layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+                        } break;
+                    case 2:
+                        {
+                            layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                            layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+
+                            layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
+                            layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
+
+                            layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
+                            layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
+
+                            layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
+                            layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
+
+                            layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+                            layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+                        } break;
+                    case 5:
+                        {
+                            layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                            layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+                        } break;
+                    default: GGML_ABORT("unknown posnet layer");
+                };
+            }
+        }
+
+        GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+        tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+        tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
+
+        // convnext
+        {
+            const int64_t n_embd = hparams.convnext.n_embd;
+
+            for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                auto & layer = layers[i].convnext;
+
+                layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+                layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
+
+                layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+                layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
+
+                layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+                layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
+
+                layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+                layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
+
+                layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+            }
+
+            // output
+            output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+            output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+        }
+
+        output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+        output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
+    } break;
+default:
+    throw std::runtime_error("unknown architecture");
+}
+
+if (n_moved_tensors > 0) {
+    LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+        __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
+        ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
+}
+}
+
+ml.done_getting_tensors();
+
+ml.init_mappings(true, use_mlock ? &pimpl->mlock_mmaps : nullptr);
+pimpl->mappings.reserve(ml.mappings.size());
+
+// create the backend buffers
+std::vector<std::pair<ggml_context *, llama_buf_map>> ctx_bufs;
+ctx_bufs.reserve(ctx_map.size());
+
+// Ensure we have enough capacity for the maximum backend buffer we will potentially create
+const size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+pimpl->bufs.reserve(n_max_backend_buffer);
+
+for (auto & it : ctx_map) {
+    ggml_backend_buffer_type_t buft = it.first;
+    ggml_context * ctx = it.second;
+
+    // skip contexts without tensors
+    if (ggml_get_first_tensor(ctx) == nullptr) {
+        continue;
+    }
+
+    llama_buf_map buf_map;
+    buf_map.reserve(n_max_backend_buffer);
+
+    // check if it is possible to use buffer_from_host_ptr with this buffer type
+    ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
+    if (!dev) {
+        // FIXME: workaround for CPU backend buft having a NULL device
+        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    }
+    ggml_backend_dev_props props;
+    ggml_backend_dev_get_props(dev, &props);
+    bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
+    bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
+
+    if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+        for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+            // only the mmap region containing the tensors in the model is mapped to the backend buffer
+            // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+            // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+            void * addr = nullptr;
+            size_t first, last; // NOLINT
+            ml.get_mapping_range(&first, &last, &addr, idx, ctx);
+            if (first >= last) {
+                continue;
+            }
+            const size_t max_size = ggml_get_max_tensor_size(ctx);
+            ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
+            if (buf == nullptr) {
+                throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+            }
+            pimpl->bufs.emplace_back(buf);
+            buf_map.emplace(idx, buf);
+        }
+    }
+    else {
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (buf == nullptr) {
+            throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft)));
+        }
+        pimpl->bufs.emplace_back(buf);
+        if (use_mlock && ggml_backend_buffer_is_host(buf)) {
+            pimpl->mlock_bufs.emplace_back(new llama_mlock);
+            auto & mlock_buf = pimpl->mlock_bufs.back();
+            mlock_buf->init (ggml_backend_buffer_get_base(buf));
+            mlock_buf->grow_to(ggml_backend_buffer_get_size(buf));
+        }
+        for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
+            buf_map.emplace(idx, buf);
+        }
+    }
+
+    if (pimpl->bufs.empty()) {
+        throw std::runtime_error("failed to allocate buffer");
+    }
+
+    for (auto & buf : buf_map) {
+        // indicate that this buffer contains weights
+        // this is used by ggml_backend_sched to improve op scheduling: ops that use a weight are preferably scheduled to the backend that contains the weight
+        ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+    }
+
+    ctx_bufs.emplace_back(ctx, buf_map);
+}
+
+if (llama_supports_gpu_offload()) {
+    const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+    LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
+    if (n_gpu_layers > (int) hparams.n_layer) {
+        LLAMA_LOG_INFO("%s: offloading output layer to GPU\n", __func__);
+    }
+
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+
+    LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
+}
+
+// print memory requirements per buffer type
+for (auto & buf : pimpl->bufs) {
+    LLAMA_LOG_INFO("%s: %12s model buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf.get()), ggml_backend_buffer_get_size(buf.get()) / 1024.0 / 1024.0);
+}
+
+// populate tensors_by_name
+for (auto & ctx : pimpl->ctxs) {
+    for (auto * cur = ggml_get_first_tensor(ctx.get()); cur != NULL; cur = ggml_get_next_tensor(ctx.get(), cur)) {
+        tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+    }
+}
+
+// load tensor data
+for (auto & it : ctx_bufs) {
+    ggml_context * ctx = it.first;
+    auto & bufs = it.second;
+    if (!ml.load_all_data(ctx, bufs, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) {
+        return false;
+    }
+}
+
+if (use_mmap_buffer) {
+    for (auto & mapping : ml.mappings) {
+        pimpl->mappings.emplace_back(std::move(mapping));
+    }
+}
+
+return true;
+}
+
+std::string llama_model::arch_name() const {
+    return llm_arch_name(arch);
+}
+
+std::string llama_model::type_name() const {
+    return llm_type_name(type);
+}
+
+std::string llama_model::desc() const {
+    return pimpl->desc_str;
+}
+
+size_t llama_model::size() const {
+    return pimpl->n_bytes;
+}
+
+size_t llama_model::max_nodes() const {
+    return std::max<size_t>(8192, tensors_by_name.size()*5);
+}
+
+size_t llama_model::n_devices() const {
+    return devices.size();
+}
+
+uint64_t llama_model::n_elements() const {
+    return pimpl->n_elements;
+}
+
+void llama_model::print_info() const {
+    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+
+    auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
+        bool is_var = false;
+
+        std::vector<uint32_t> v;
+        for (uint32_t i = 0; i < n; ++i) {
+            v.push_back(f(i));
+            if (v[i] != v[0]) {
+                is_var = true;
+            }
+        }
+
+        std::stringstream ss;
+
+        if (is_var) {
+            ss << "[";
+            for (uint32_t i = 0; i < n; ++i) {
+                ss << v[i];
+                if (i < n - 1) {
+                    ss << ", ";
+                }
+            }
+            ss << "]";
+        } else {
+            ss << v[0];
+        }
+
+        return ss.str();
+    };
+
+    // hparams
+    LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str());
+    LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only);
+
+    if (!hparams.vocab_only) {
+        LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train);
+        LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd);
+        LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer);
+        LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot);
+        LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa);
+        LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k);
+        LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v);
+        LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
+        LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
+        LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
+        LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+        LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
+        LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str());
+        LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
+        LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+        LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
+        LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+        LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
+        LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+        LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
+        LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
+        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
+        LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
+        LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv);
+        LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner);
+        LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state);
+        LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank);
+        LLAMA_LOG_INFO("%s: ssm_dt_b_c_rms = %d\n", __func__, hparams.ssm_dt_b_c_rms);
+    }
+
+    LLAMA_LOG_INFO("%s: model type = %s\n", __func__, type_name().c_str());
+    if (pimpl->n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, pimpl->n_elements*1e-12);
+    } else if (pimpl->n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, pimpl->n_elements*1e-9);
+    } else if (pimpl->n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, pimpl->n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, pimpl->n_elements*1e-3);
+    }
+
+    // general kv
+    LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, name.c_str());
+
+    if (arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+    }
+
+    if (arch == LLM_ARCH_DEEPSEEK2) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
+        LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+        LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((enum llama_expert_gating_func_type) hparams.expert_gating_func));
+        LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul);
+    }
+
+    if (arch == LLM_ARCH_QWEN2MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
+    }
+
+    if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
+        LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
+        LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
+        LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
+    }
+
+    vocab.print_info();
+}
+
+ggml_backend_dev_t llama_model::dev_layer(int il) const {
+    return pimpl->dev_layer.at(il).dev;
+}
+
+ggml_backend_dev_t llama_model::dev_output() const {
+    return pimpl->dev_output.dev;
+}
+
+template<typename F>
+static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
+    ggml_init_params params = {
+        /*.mem_size =*/ ggml_tensor_overhead()*8,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc =*/ true,
+    };
+
+    ggml_context_ptr ctx { ggml_init(params) };
+    if (!ctx) {
+        throw std::runtime_error(format("failed to create ggml context"));
+    }
+
+    ggml_backend_buffer_ptr buf { ggml_backend_buft_alloc_buffer(buft, 0) };
+    ggml_tensor * op_tensor = fn(ctx.get());
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (op_tensor->src[i] != nullptr) {
+            assert(op_tensor->src[i]->buffer == nullptr);
+            op_tensor->src[i]->buffer = buf.get();
+        }
+    }
+
+    bool op_supported = ggml_backend_dev_supports_op(dev, op_tensor);
+
+    return op_supported;
+}
+
+template<typename F>
+static ggml_backend_buffer_type_t select_buft(const buft_list_t & buft_list, const F & fn) {
+    for (const auto & cur : buft_list) {
+        ggml_backend_dev_t cur_dev = cur.first;
+        ggml_backend_buffer_type_t cur_buft = cur.second;
+        if (buft_supported(cur_buft, cur_dev, fn)) {
+            return cur_buft;
+        }
+    }
+
+    throw std::runtime_error(format("no suitable buffer type found"));
+}
+
+ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
+    return ::select_buft(
+        *pimpl->dev_layer.at(il).buft_list,
+        [&](ggml_context * ctx) {
+            ggml_tensor * cur = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+            ggml_tensor * layer_dir = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
+            return ggml_add(ctx, cur, layer_dir);
+        });
+}
+
+const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
+    auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
+        [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+            return it.first == name;
+        });
+    if (it == tensors_by_name.end()) {
+        return nullptr;
+    }
+
+    return it->second;
+}
+
+//
+// interface implementation
+//
+
+struct llama_model_params llama_model_default_params() {
+    struct llama_model_params result = {
+        /*.devices =*/ nullptr,
+        /*.n_gpu_layers =*/ 0,
+        /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
+        /*.main_gpu =*/ 0,
+        /*.tensor_split =*/ nullptr,
+        /*.progress_callback =*/ nullptr,
+        /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides =*/ nullptr,
+        /*.vocab_only =*/ false,
+        /*.use_mmap =*/ true,
+        /*.use_mlock =*/ false,
+        /*.check_tensors =*/ false,
+    };
+
+#ifdef GGML_USE_METAL
+    // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
+    result.n_gpu_layers = 999;
+#endif
+
+    return result;
+}
+
+const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model) {
+    return &model->vocab;
+}
+
+void llama_free_model(struct llama_model * model) {
+    llama_model_free(model);
+}
+
+void llama_model_free(struct llama_model * model) {
+    delete model;
+}
+
+int32_t llama_model_n_ctx_train(const struct llama_model * model) {
+    return model->hparams.n_ctx_train;
+}
+
+int32_t llama_model_n_embd(const struct llama_model * model) {
+    return model->hparams.n_embd;
+}
+
+int32_t llama_model_n_layer(const struct llama_model * model) {
+    return model->hparams.n_layer;
+}
+
+int32_t llama_model_n_head(const struct llama_model * model) {
+    return model->hparams.n_head();
+}
+
+// deprecated
+int32_t llama_n_ctx_train(const struct llama_model * model) {
+    return llama_model_n_ctx_train(model);
+}
+
+// deprecated
+int32_t llama_n_embd(const struct llama_model * model) {
+    return llama_model_n_embd(model);
+}
+
+// deprecated
+int32_t llama_n_layer(const struct llama_model * model) {
+    return llama_model_n_layer(model);
+}
+
+// deprecated
+int32_t llama_n_head(const struct llama_model * model) {
+    return llama_model_n_head(model);
+}
+
+enum llama_rope_type llama_model_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_JAIS:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_DECI:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+        case LLM_ARCH_XVERSE:
+        case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COHERE2:
+        case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
+        case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_CHATGLM:
+        case LLM_ARCH_GRANITE:
+        case LLM_ARCH_GRANITE_MOE:
+        case LLM_ARCH_CHAMELEON:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_GROK:
+        case LLM_ARCH_DBRX:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_BITNET:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
+        case LLM_ARCH_OLMOE:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_PHI3:
+        case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_GEMMA:
+        case LLM_ARCH_GEMMA2:
+        case LLM_ARCH_STARCODER2:
+        case LLM_ARCH_OPENELM:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_NEMOTRON:
+        case LLM_ARCH_EXAONE:
+        case LLM_ARCH_MINICPM3:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ABORT("unknown architecture");
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
+float llama_model_rope_freq_scale_train(const struct llama_model * model) {
+    return model->hparams.rope_freq_scale_train;
+}
+
+int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int32_t llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "%s", model->desc().c_str());
+}
+
+uint64_t llama_model_size(const struct llama_model * model) {
+    return model->size();
+}
+
+const char * llama_model_chat_template(const struct llama_model * model) {
+    const auto & it = model->gguf_kv.find(LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE));
+    if (it == model->gguf_kv.end()) {
+        return nullptr;
+    }
+
+    return it->second.c_str();
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    return model->n_elements();
+}
+
+bool llama_model_has_encoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5: return true;
+        case LLM_ARCH_T5ENCODER: return true;
+        default: return false;
+    }
+}
+
+bool llama_model_has_decoder(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_T5ENCODER: return false;
+        default: return true;
+    }
+}
+
+llama_token llama_model_decoder_start_token(const struct llama_model * model) {
+    return model->hparams.dec_start_token_id;
+}
+
+bool llama_model_is_recurrent(const struct llama_model * model) {
+    switch (model->arch) {
+        case LLM_ARCH_MAMBA: return true;
+        case LLM_ARCH_RWKV6: return true;
+        case LLM_ARCH_RWKV6QWEN2: return true;
+        default: return false;
+    }
+}
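The functions above are part of llama.cpp's public C API, which the native addon in this package (src/LlamaContext.cpp) builds against. As a rough usage sketch, not part of the diff, the following shows how the `llama_model_*` accessors added above could be exercised from a standalone C++ program. The model path `model.gguf` is a placeholder, and `llama_model_load_from_file` is assumed to be available in this llama.cpp revision (older code used the since-deprecated `llama_load_model_from_file`).

```cpp
// Hypothetical sketch: query model metadata through the llama_model_* accessors.
// Assumes a GGUF file at "model.gguf" and that llama_model_load_from_file exists
// in this vendored llama.cpp revision.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.use_mmap = true; // map tensor data from disk instead of copying it

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    // accessors defined in the diff above
    printf("params     : %llu\n", (unsigned long long) llama_model_n_params(model));
    printf("n_embd     : %d\n", llama_model_n_embd(model));
    printf("n_layer    : %d\n", llama_model_n_layer(model));
    printf("rope type  : %d\n", (int) llama_model_rope_type(model));
    printf("recurrent  : %d\n", llama_model_is_recurrent(model));

    const char * tmpl = llama_model_chat_template(model);
    printf("chat template: %s\n", tmpl ? tmpl : "(none)");

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```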