@fugood/llama.node 0.3.15 → 0.3.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +243 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +14 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +15 -8
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +2413 -228
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1004 -13516
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +127 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +29 -293
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +12 -43
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +210 -286
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +692 -126
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +21 -10
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +161 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1544 -291
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +139 -57
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <cmath>
|
|
18
18
|
#include <functional>
|
|
19
19
|
#include <map>
|
|
20
|
+
#include <regex>
|
|
20
21
|
#include <sstream>
|
|
21
22
|
#include <stdexcept>
|
|
22
23
|
|
|
@@ -42,11 +43,14 @@ const char * llm_type_name(llm_type type) {
|
|
|
42
43
|
case LLM_TYPE_770M: return "770M";
|
|
43
44
|
case LLM_TYPE_780M: return "780M";
|
|
44
45
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
46
|
+
case LLM_TYPE_0_6B: return "0.6B";
|
|
45
47
|
case LLM_TYPE_1B: return "1B";
|
|
46
48
|
case LLM_TYPE_1_3B: return "1.3B";
|
|
47
49
|
case LLM_TYPE_1_4B: return "1.4B";
|
|
48
50
|
case LLM_TYPE_1_5B: return "1.5B";
|
|
49
51
|
case LLM_TYPE_1_6B: return "1.6B";
|
|
52
|
+
case LLM_TYPE_1_7B: return "1.7B";
|
|
53
|
+
case LLM_TYPE_1_8B: return "1.8B";
|
|
50
54
|
case LLM_TYPE_2B: return "2B";
|
|
51
55
|
case LLM_TYPE_2_8B: return "2.8B";
|
|
52
56
|
case LLM_TYPE_2_9B: return "2.9B";
|
|
@@ -64,6 +68,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
64
68
|
case LLM_TYPE_15B: return "15B";
|
|
65
69
|
case LLM_TYPE_16B: return "16B";
|
|
66
70
|
case LLM_TYPE_20B: return "20B";
|
|
71
|
+
case LLM_TYPE_27B: return "27B";
|
|
67
72
|
case LLM_TYPE_30B: return "30B";
|
|
68
73
|
case LLM_TYPE_32B: return "32B";
|
|
69
74
|
case LLM_TYPE_34B: return "34B";
|
|
@@ -72,6 +77,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
72
77
|
case LLM_TYPE_65B: return "65B";
|
|
73
78
|
case LLM_TYPE_70B: return "70B";
|
|
74
79
|
case LLM_TYPE_236B: return "236B";
|
|
80
|
+
case LLM_TYPE_290B: return "290B";
|
|
75
81
|
case LLM_TYPE_314B: return "314B";
|
|
76
82
|
case LLM_TYPE_671B: return "671B";
|
|
77
83
|
case LLM_TYPE_SMALL: return "0.1B";
|
|
@@ -86,7 +92,10 @@ const char * llm_type_name(llm_type type) {
|
|
|
86
92
|
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
|
87
93
|
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
|
88
94
|
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
|
89
|
-
case
|
|
95
|
+
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
|
96
|
+
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
97
|
+
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
98
|
+
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
90
99
|
default: return "?B";
|
|
91
100
|
}
|
|
92
101
|
}
|
|
@@ -255,7 +264,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
|
|
|
255
264
|
return nullptr;
|
|
256
265
|
}
|
|
257
266
|
|
|
258
|
-
// CPU: ACCEL ->
|
|
267
|
+
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
|
259
268
|
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
|
|
260
269
|
buft_list_t buft_list;
|
|
261
270
|
|
|
@@ -271,19 +280,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
271
280
|
}
|
|
272
281
|
}
|
|
273
282
|
|
|
274
|
-
// add extra buffer types
|
|
275
|
-
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
276
|
-
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
277
|
-
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
278
|
-
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
279
|
-
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
280
|
-
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
281
|
-
while (extra_bufts && *extra_bufts) {
|
|
282
|
-
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
283
|
-
++extra_bufts;
|
|
284
|
-
}
|
|
285
|
-
}
|
|
286
|
-
|
|
287
283
|
// add a host buffer type
|
|
288
284
|
// storing the tensors in a host buffer is useful when the processing of large batches
|
|
289
285
|
// is offloaded to a GPU device, since it reduces the time spent on data transfers
|
|
@@ -298,6 +294,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
298
294
|
}
|
|
299
295
|
}
|
|
300
296
|
|
|
297
|
+
// add extra buffer types, only if no GPU device is present
|
|
298
|
+
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
|
299
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
300
|
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
301
|
+
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
302
|
+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
303
|
+
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
304
|
+
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
305
|
+
while (extra_bufts && *extra_bufts) {
|
|
306
|
+
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
307
|
+
++extra_bufts;
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
|
|
301
311
|
// add the CPU buffer type
|
|
302
312
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
303
313
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
@@ -375,9 +385,12 @@ struct llama_model::impl {
|
|
|
375
385
|
layer_dev dev_input = {};
|
|
376
386
|
layer_dev dev_output = {};
|
|
377
387
|
std::vector<layer_dev> dev_layer;
|
|
388
|
+
|
|
389
|
+
bool has_tensor_overrides;
|
|
378
390
|
};
|
|
379
391
|
|
|
380
392
|
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
|
|
393
|
+
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
|
381
394
|
}
|
|
382
395
|
|
|
383
396
|
llama_model::~llama_model() {}
|
|
@@ -543,6 +556,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
543
556
|
}
|
|
544
557
|
}
|
|
545
558
|
} break;
|
|
559
|
+
case LLM_ARCH_LLAMA4:
|
|
560
|
+
{
|
|
561
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
562
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
563
|
+
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
564
|
+
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
|
|
565
|
+
hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
|
566
|
+
hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
|
|
567
|
+
|
|
568
|
+
switch (hparams.n_expert) {
|
|
569
|
+
case 16: type = LLM_TYPE_17B_16E; break;
|
|
570
|
+
case 128: type = LLM_TYPE_17B_128E; break;
|
|
571
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
if (type == LLM_TYPE_17B_128E) {
|
|
575
|
+
hparams.use_kq_norm = false;
|
|
576
|
+
}
|
|
577
|
+
} break;
|
|
546
578
|
case LLM_ARCH_DECI:
|
|
547
579
|
{
|
|
548
580
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -667,10 +699,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
667
699
|
}
|
|
668
700
|
} break;
|
|
669
701
|
case LLM_ARCH_NOMIC_BERT:
|
|
702
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
670
703
|
{
|
|
671
704
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
672
705
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
673
706
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
707
|
+
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
|
674
708
|
|
|
675
709
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
|
676
710
|
type = LLM_TYPE_137M;
|
|
@@ -759,6 +793,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
759
793
|
default: type = LLM_TYPE_UNKNOWN;
|
|
760
794
|
}
|
|
761
795
|
} break;
|
|
796
|
+
case LLM_ARCH_QWEN3:
|
|
797
|
+
{
|
|
798
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
799
|
+
switch (hparams.n_layer) {
|
|
800
|
+
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
|
801
|
+
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
|
802
|
+
case 40: type = LLM_TYPE_14B; break;
|
|
803
|
+
case 64: type = LLM_TYPE_32B; break;
|
|
804
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
805
|
+
}
|
|
806
|
+
} break;
|
|
807
|
+
case LLM_ARCH_QWEN3MOE:
|
|
808
|
+
{
|
|
809
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
810
|
+
|
|
811
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
812
|
+
switch (hparams.n_layer) {
|
|
813
|
+
case 48: type = LLM_TYPE_30B_A3B; break;
|
|
814
|
+
case 94: type = LLM_TYPE_235B_A22B; break;
|
|
815
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
816
|
+
}
|
|
817
|
+
} break;
|
|
762
818
|
case LLM_ARCH_PHI2:
|
|
763
819
|
{
|
|
764
820
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1112,6 +1168,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1112
1168
|
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
|
1113
1169
|
}
|
|
1114
1170
|
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1171
|
+
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
|
|
1172
|
+
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
|
|
1115
1173
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1116
1174
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1117
1175
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
@@ -1131,6 +1189,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1131
1189
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1132
1190
|
}
|
|
1133
1191
|
} break;
|
|
1192
|
+
case LLM_ARCH_PLM:
|
|
1193
|
+
{
|
|
1194
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1195
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1196
|
+
switch (hparams.n_layer) {
|
|
1197
|
+
case 32: type = LLM_TYPE_1_8B; break;
|
|
1198
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1199
|
+
}
|
|
1200
|
+
} break;
|
|
1134
1201
|
case LLM_ARCH_CHATGLM:
|
|
1135
1202
|
{
|
|
1136
1203
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1152,6 +1219,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1152
1219
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1153
1220
|
}
|
|
1154
1221
|
} break;
|
|
1222
|
+
case LLM_ARCH_GLM4:
|
|
1223
|
+
{
|
|
1224
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1225
|
+
switch (hparams.n_layer) {
|
|
1226
|
+
case 40: type = LLM_TYPE_9B; break;
|
|
1227
|
+
case 61: type = LLM_TYPE_32B; break;
|
|
1228
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1229
|
+
}
|
|
1230
|
+
} break;
|
|
1155
1231
|
case LLM_ARCH_BITNET:
|
|
1156
1232
|
{
|
|
1157
1233
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1317,6 +1393,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1317
1393
|
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
|
1318
1394
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
1319
1395
|
} break;
|
|
1396
|
+
case LLM_ARCH_BAILINGMOE:
|
|
1397
|
+
{
|
|
1398
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1399
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1400
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1401
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1402
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1403
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1404
|
+
|
|
1405
|
+
switch (hparams.n_layer) {
|
|
1406
|
+
case 28: type = LLM_TYPE_16B; break;
|
|
1407
|
+
case 88: type = LLM_TYPE_290B; break;
|
|
1408
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1409
|
+
}
|
|
1410
|
+
} break;
|
|
1320
1411
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1321
1412
|
}
|
|
1322
1413
|
|
|
@@ -1544,9 +1635,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1544
1635
|
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
1545
1636
|
}
|
|
1546
1637
|
|
|
1547
|
-
ggml_backend_buffer_type_t buft =
|
|
1638
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
1639
|
+
|
|
1640
|
+
// check overrides
|
|
1641
|
+
if (ml.tensor_buft_overrides) {
|
|
1642
|
+
std::string tensor_name = tn.str();
|
|
1643
|
+
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
1644
|
+
std::regex pattern(overrides->pattern);
|
|
1645
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
1646
|
+
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
|
|
1647
|
+
buft = overrides->buft;
|
|
1648
|
+
break;
|
|
1649
|
+
}
|
|
1650
|
+
}
|
|
1651
|
+
}
|
|
1652
|
+
|
|
1548
1653
|
if (!buft) {
|
|
1549
|
-
|
|
1654
|
+
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
|
1655
|
+
if (!buft) {
|
|
1656
|
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
1657
|
+
}
|
|
1550
1658
|
}
|
|
1551
1659
|
|
|
1552
1660
|
// avoid using a host buffer when using mmap
|
|
@@ -1642,6 +1750,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1642
1750
|
}
|
|
1643
1751
|
}
|
|
1644
1752
|
} break;
|
|
1753
|
+
case LLM_ARCH_LLAMA4:
|
|
1754
|
+
{
|
|
1755
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1756
|
+
|
|
1757
|
+
// output
|
|
1758
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1759
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1760
|
+
|
|
1761
|
+
// if output is NULL, init from the input tok embed
|
|
1762
|
+
if (output == NULL) {
|
|
1763
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1764
|
+
}
|
|
1765
|
+
|
|
1766
|
+
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
|
|
1767
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1768
|
+
bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
|
|
1769
|
+
|
|
1770
|
+
auto & layer = layers[i];
|
|
1771
|
+
|
|
1772
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1773
|
+
|
|
1774
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
1775
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
1776
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
1777
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
1778
|
+
|
|
1779
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1780
|
+
|
|
1781
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1782
|
+
|
|
1783
|
+
if (is_moe_layer) {
|
|
1784
|
+
int n_ff_exp = hparams.n_ff_exp;
|
|
1785
|
+
|
|
1786
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1787
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1788
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
|
|
1789
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1790
|
+
|
|
1791
|
+
// Shared expert
|
|
1792
|
+
const int64_t n_ff_shexp = n_ff_exp;
|
|
1793
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1794
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
|
|
1795
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1796
|
+
} else {
|
|
1797
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1798
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1799
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1800
|
+
}
|
|
1801
|
+
}
|
|
1802
|
+
} break;
|
|
1645
1803
|
case LLM_ARCH_DECI:
|
|
1646
1804
|
{
|
|
1647
1805
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -1911,6 +2069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1911
2069
|
} break;
|
|
1912
2070
|
case LLM_ARCH_BERT:
|
|
1913
2071
|
case LLM_ARCH_NOMIC_BERT:
|
|
2072
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
1914
2073
|
{
|
|
1915
2074
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1916
2075
|
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
|
@@ -1944,20 +2103,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1944
2103
|
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1945
2104
|
}
|
|
1946
2105
|
|
|
2106
|
+
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2107
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2108
|
+
}
|
|
2109
|
+
|
|
1947
2110
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1948
2111
|
|
|
1949
2112
|
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1950
2113
|
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1951
2114
|
|
|
1952
|
-
|
|
1953
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1954
|
-
|
|
1955
|
-
if (arch == LLM_ARCH_BERT) {
|
|
2115
|
+
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
|
1956
2116
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1957
|
-
layer.
|
|
1958
|
-
layer.
|
|
2117
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
|
2118
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
2119
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1959
2120
|
} else {
|
|
1960
|
-
layer.
|
|
2121
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2122
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2123
|
+
|
|
2124
|
+
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2125
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2126
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
2127
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2128
|
+
} else {
|
|
2129
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2130
|
+
}
|
|
1961
2131
|
}
|
|
1962
2132
|
|
|
1963
2133
|
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
@@ -2210,9 +2380,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2210
2380
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
2211
2381
|
|
|
2212
2382
|
// optional bias tensors
|
|
2213
|
-
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
|
2214
|
-
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
|
2215
|
-
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
|
2383
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
2384
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2385
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
2216
2386
|
|
|
2217
2387
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2218
2388
|
|
|
@@ -2241,6 +2411,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2241
2411
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
2242
2412
|
}
|
|
2243
2413
|
} break;
|
|
2414
|
+
case LLM_ARCH_QWEN3:
|
|
2415
|
+
{
|
|
2416
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2417
|
+
|
|
2418
|
+
// output
|
|
2419
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2420
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2421
|
+
// if output is NULL, init from the input tok embed
|
|
2422
|
+
if (output == NULL) {
|
|
2423
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2424
|
+
}
|
|
2425
|
+
|
|
2426
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2427
|
+
auto & layer = layers[i];
|
|
2428
|
+
|
|
2429
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2430
|
+
|
|
2431
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2432
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2433
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2434
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2435
|
+
|
|
2436
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2437
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2438
|
+
|
|
2439
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2440
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2441
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2442
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2443
|
+
}
|
|
2444
|
+
} break;
|
|
2445
|
+
case LLM_ARCH_QWEN3MOE:
|
|
2446
|
+
{
|
|
2447
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2448
|
+
|
|
2449
|
+
// output
|
|
2450
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2451
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2452
|
+
|
|
2453
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2454
|
+
auto & layer = layers[i];
|
|
2455
|
+
|
|
2456
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2457
|
+
|
|
2458
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2459
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2460
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2461
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2462
|
+
|
|
2463
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2464
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2465
|
+
|
|
2466
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2467
|
+
|
|
2468
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2469
|
+
|
|
2470
|
+
if (n_expert == 0) {
|
|
2471
|
+
throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
|
|
2472
|
+
}
|
|
2473
|
+
if (n_expert_used == 0) {
|
|
2474
|
+
throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
|
|
2475
|
+
}
|
|
2476
|
+
|
|
2477
|
+
// MoE branch
|
|
2478
|
+
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
|
2479
|
+
|
|
2480
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2481
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
2482
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2483
|
+
}
|
|
2484
|
+
} break;
|
|
2244
2485
|
case LLM_ARCH_PHI2:
|
|
2245
2486
|
{
|
|
2246
2487
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -2329,7 +2570,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2329
2570
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
|
2330
2571
|
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), { n_embd }, 0);
|
|
2331
2572
|
|
|
2332
|
-
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa },
|
|
2573
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, TENSOR_NOT_REQUIRED);
|
|
2333
2574
|
if (layer.wqkv == nullptr) {
|
|
2334
2575
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
2335
2576
|
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, 0);
|
|
@@ -2558,7 +2799,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2558
2799
|
|
|
2559
2800
|
// output
|
|
2560
2801
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2561
|
-
output = create_tensor(tn(
|
|
2802
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2803
|
+
|
|
2804
|
+
// if output is NULL, init from the input tok embed
|
|
2805
|
+
if (output == NULL) {
|
|
2806
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2807
|
+
}
|
|
2562
2808
|
|
|
2563
2809
|
for (int i = 0; i < n_layer; ++i) {
|
|
2564
2810
|
auto & layer = layers[i];
|
|
@@ -2985,8 +3231,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2985
3231
|
{
|
|
2986
3232
|
const bool is_lite = (hparams.n_layer == 27);
|
|
2987
3233
|
|
|
3234
|
+
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
|
3235
|
+
|
|
3236
|
+
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
|
3237
|
+
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
|
3238
|
+
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
|
3239
|
+
|
|
2988
3240
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
2989
|
-
const int64_t n_embd_head_qk_nope =
|
|
3241
|
+
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
|
2990
3242
|
|
|
2991
3243
|
const int64_t q_lora_rank = hparams.n_lora_q;
|
|
2992
3244
|
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
@@ -3012,14 +3264,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3012
3264
|
|
|
3013
3265
|
if (!is_lite) {
|
|
3014
3266
|
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
|
3015
|
-
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head *
|
|
3267
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
|
|
3016
3268
|
} else {
|
|
3017
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd,
|
|
3269
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
|
|
3018
3270
|
}
|
|
3019
3271
|
|
|
3020
|
-
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank +
|
|
3021
|
-
|
|
3022
|
-
|
|
3272
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
|
|
3273
|
+
|
|
3274
|
+
// note: only old legacy GGUF files will have the unsplit wkv_b tensor in
|
|
3275
|
+
if (is_mla) {
|
|
3276
|
+
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
|
|
3277
|
+
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
|
|
3278
|
+
} else {
|
|
3279
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
|
|
3280
|
+
}
|
|
3281
|
+
|
|
3282
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
|
|
3023
3283
|
|
|
3024
3284
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3025
3285
|
|
|
@@ -3050,6 +3310,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3050
3310
|
}
|
|
3051
3311
|
}
|
|
3052
3312
|
} break;
|
|
3313
|
+
case LLM_ARCH_PLM:
|
|
3314
|
+
{
|
|
3315
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
3316
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
3317
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
3318
|
+
|
|
3319
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3320
|
+
|
|
3321
|
+
// output
|
|
3322
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3323
|
+
// output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
3324
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3325
|
+
|
|
3326
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3327
|
+
auto & layer = layers[i];
|
|
3328
|
+
|
|
3329
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3330
|
+
|
|
3331
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3332
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
|
3333
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
|
3334
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
|
3335
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
|
3336
|
+
|
|
3337
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3338
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3339
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3340
|
+
}
|
|
3341
|
+
} break;
|
|
3053
3342
|
case LLM_ARCH_BITNET:
|
|
3054
3343
|
{
|
|
3055
3344
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3215,16 +3504,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3215
3504
|
auto & layer = layers[i];
|
|
3216
3505
|
|
|
3217
3506
|
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3218
|
-
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa},
|
|
3219
|
-
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa},
|
|
3507
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3508
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3220
3509
|
|
|
3221
3510
|
if (layer.wqkv == nullptr) {
|
|
3222
3511
|
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3223
3512
|
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
3224
3513
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
3225
|
-
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd},
|
|
3226
|
-
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa},
|
|
3227
|
-
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa},
|
|
3514
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3515
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3516
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3228
3517
|
}
|
|
3229
3518
|
|
|
3230
3519
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
@@ -3236,23 +3525,62 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3236
3525
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3237
3526
|
}
|
|
3238
3527
|
} break;
|
|
3239
|
-
case
|
|
3528
|
+
case LLM_ARCH_GLM4:
|
|
3240
3529
|
{
|
|
3241
3530
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3242
3531
|
|
|
3243
3532
|
// output
|
|
3244
|
-
output_norm
|
|
3245
|
-
|
|
3246
|
-
output
|
|
3533
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3534
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3535
|
+
// if output is NULL, init from the input tok embed
|
|
3536
|
+
if (output == NULL) {
|
|
3537
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3538
|
+
}
|
|
3247
3539
|
|
|
3248
3540
|
for (int i = 0; i < n_layer; ++i) {
|
|
3249
3541
|
auto & layer = layers[i];
|
|
3250
3542
|
|
|
3251
|
-
layer.attn_norm
|
|
3252
|
-
layer.
|
|
3543
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3544
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3545
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3253
3546
|
|
|
3254
|
-
layer.
|
|
3255
|
-
|
|
3547
|
+
if (layer.wqkv == nullptr) {
|
|
3548
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3549
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
3550
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
3551
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3552
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3553
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3554
|
+
}
|
|
3555
|
+
|
|
3556
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
3557
|
+
|
|
3558
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3559
|
+
|
|
3560
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3561
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3562
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
|
3563
|
+
|
|
3564
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3565
|
+
}
|
|
3566
|
+
} break;
|
|
3567
|
+
case LLM_ARCH_NEMOTRON:
|
|
3568
|
+
{
|
|
3569
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3570
|
+
|
|
3571
|
+
// output
|
|
3572
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3573
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
|
|
3574
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
3575
|
+
|
|
3576
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3577
|
+
auto & layer = layers[i];
|
|
3578
|
+
|
|
3579
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3580
|
+
layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, 0);
|
|
3581
|
+
|
|
3582
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
|
3583
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3256
3584
|
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
3257
3585
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
3258
3586
|
|
|
@@ -3335,12 +3663,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3335
3663
|
layer.time_mix_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_W2, "weight", i), {time_mix_extra_dim, n_embd, 5}, 0);
|
|
3336
3664
|
|
|
3337
3665
|
layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
|
|
3338
|
-
layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1},
|
|
3339
|
-
layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1},
|
|
3340
|
-
layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1},
|
|
3341
|
-
layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1},
|
|
3342
|
-
layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1},
|
|
3343
|
-
layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5},
|
|
3666
|
+
layer.time_mix_lerp_w = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_W, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
|
|
3667
|
+
layer.time_mix_lerp_k = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_K, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
|
|
3668
|
+
layer.time_mix_lerp_v = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_V, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
|
|
3669
|
+
layer.time_mix_lerp_r = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_R, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
|
|
3670
|
+
layer.time_mix_lerp_g = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_G, "weight", i), {n_embd, 1, 1}, TENSOR_NOT_REQUIRED);
|
|
3671
|
+
layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, TENSOR_NOT_REQUIRED);
|
|
3344
3672
|
GGML_ASSERT(!(layer.time_mix_lerp_fused == NULL && layer.time_mix_lerp_w == NULL));
|
|
3345
3673
|
|
|
3346
3674
|
layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, 0);
|
|
@@ -3370,7 +3698,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3370
3698
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3371
3699
|
|
|
3372
3700
|
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3373
|
-
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd},
|
|
3701
|
+
output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3374
3702
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
3375
3703
|
|
|
3376
3704
|
const int time_mix_extra_dim = hparams.time_mix_extra_dim;
|
|
@@ -3396,7 +3724,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3396
3724
|
layer.time_mix_lerp_x = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_X, "weight", i), {n_embd, 1, 1}, 0);
|
|
3397
3725
|
layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 5}, 0);
|
|
3398
3726
|
|
|
3399
|
-
-            layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size},
+            layer.time_mix_first = create_tensor(tn(LLM_TENSOR_TIME_MIX_FIRST, "weight", i), {head_size, n_embd / head_size}, TENSOR_NOT_REQUIRED);
             layer.time_mix_decay = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY, "weight", i), {n_embd}, 0);
             layer.time_mix_decay_w1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W1, "weight", i), {n_embd, time_decay_extra_dim}, 0);
             layer.time_mix_decay_w2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_DECAY_W2, "weight", i), {time_decay_extra_dim, attn_hidden_size}, 0);
@@ -3405,9 +3733,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);
             layer.time_mix_gate = create_tensor(tn(LLM_TENSOR_TIME_MIX_GATE, "weight", i), {attn_hidden_size, n_embd}, 0);
             // optional bias tensors
-            layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size},
-            layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size},
-            layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size},
+            layer.time_mix_key_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_KEY, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+            layer.time_mix_value_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "bias", i), {attn_key_value_size}, TENSOR_NOT_REQUIRED);
+            layer.time_mix_receptance_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "bias", i), {attn_hidden_size}, TENSOR_NOT_REQUIRED);

             layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

@@ -3528,8 +3856,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 layer.time_mix_v2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_V2, "weight", i), {n_lora_value_res_mix, n_embd}, 0);
             }

-            layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate},
-            layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd},
+            layer.time_mix_g1 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G1, "weight", i), {n_embd, n_lora_gate}, TENSOR_NOT_REQUIRED);
+            layer.time_mix_g2 = create_tensor(tn(LLM_TENSOR_TIME_MIX_G2, "weight", i), {n_lora_gate, n_embd}, TENSOR_NOT_REQUIRED);

             try {
                 layer.time_mix_lerp_fused = create_tensor(tn(LLM_TENSOR_TIME_MIX_LERP_FUSED, "weight", i), {n_embd, 1, 1, 6}, 0);
@@ -3546,8 +3874,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             layer.time_mix_value = create_tensor(tn(LLM_TENSOR_TIME_MIX_VALUE, "weight", i), {attn_hidden_size, n_embd}, 0);
             layer.time_mix_receptance = create_tensor(tn(LLM_TENSOR_TIME_MIX_RECEPTANCE, "weight", i), {attn_hidden_size, n_embd}, 0);

-            layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd},
-            layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd},
+            layer.time_mix_ln = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+            layer.time_mix_ln_b = create_tensor(tn(LLM_TENSOR_TIME_MIX_LN, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
             layer.time_mix_output = create_tensor(tn(LLM_TENSOR_TIME_MIX_OUTPUT, "weight", i), {n_embd, attn_hidden_size}, 0);

             layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
@@ -3694,6 +4022,46 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
                     output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
                 } break;
+            case LLM_ARCH_BAILINGMOE:
+                {
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                        if (n_expert == 0) {
+                            throw std::runtime_error("n_expert must be > 0");
+                        }
+                        if (n_expert_used == 0) {
+                            throw std::runtime_error("n_expert_used must be > 0");
+                        }
+
+                        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                        layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+                        layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                    }
+                } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -3962,6 +4330,8 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
         LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
+        LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
+        LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
         LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
         LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
         LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
@@ -3975,12 +4345,24 @@ void llama_model::print_info() const {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }

+    if (arch == LLM_ARCH_QWEN3MOE) {
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+    }
+
     if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
     }

+    if (arch == LLM_ARCH_BAILINGMOE) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+        LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
+    }
+
     vocab.print_info();
 }

@@ -4042,6 +4424,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
         });
 }

+bool llama_model::has_tensor_overrides() const {
+    return pimpl->has_tensor_overrides;
+}
+
 const ggml_tensor * llama_model::get_tensor(const char * name) const {
     auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
             [name](const std::pair<std::string, ggml_tensor *> & it) {
@@ -4069,12 +4455,22 @@ struct llm_build_llama : public llm_graph_context {
         // inp_pos - contains the positions
         ggml_tensor * inp_pos = build_inp_pos();

+        // temperature tuning
+        ggml_tensor * inp_attn_scale = nullptr;
+        if (arch == LLM_ARCH_LLAMA4) {
+            inp_attn_scale = build_inp_attn_scale();
+        }
+
         auto * inp_attn = build_attn_inp_kv_unified();

         const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;

+            bool use_rope = arch == LLM_ARCH_LLAMA4
+                ? (il + 1) % hparams.n_no_rope_layer_step != 0
+                : true;
+
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
@@ -4112,25 +4508,38 @@ struct llm_build_llama : public llm_graph_context {
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

-
-
-
-
-
+            if (use_rope) {
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );

-
-
-
-
-
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, rope_factors,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+            } else if (inp_attn_scale) {
+                Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
+            }

             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
             cb(Vcur, "Vcur", il);

+            if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
+                // Llama4TextL2Norm
+                Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
+                Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                cb(Qcur, "Qcur_normed", il);
+                cb(Kcur, "Kcur_normed", il);
+            }
+
             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+            cb(cur, "attn_out", il);
         }

         if (il == n_layer - 1) {
@@ -4148,7 +4557,7 @@ struct llm_build_llama : public llm_graph_context {
             ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

-            // feed-forward network
+            // feed-forward network (non-MoE)
             if (model.layers[il].ffn_gate_inp == nullptr) {

                 cur = build_norm(ffn_inp,
@@ -4163,6 +4572,38 @@ struct llm_build_llama : public llm_graph_context {
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
                 cb(cur, "ffn_out", il);
+
+            } else if (arch == LLM_ARCH_LLAMA4) {
+                // llama4 MoE
+                ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, il);
+                cb(cur, "ffn_norm", il);
+
+                ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        model.layers[il].ffn_gate_exps,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        n_expert, n_expert_used,
+                        LLM_FFN_SILU, false,
+                        false, 0.0,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
+                        il);
+
+                // Shared experts
+                ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
+                        model.layers[il].ffn_up_shexp, NULL, NULL,
+                        model.layers[il].ffn_gate_shexp, NULL, NULL,
+                        model.layers[il].ffn_down_shexp, NULL, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(shexp_out, "ffn_moe_shexp", il);
+
+                cur = ggml_add(ctx0, moe_out, shexp_out);
+                cb(cur, "ffn_moe_out_merged", il);
+
             } else {
                 // MoE branch
                 cur = build_norm(ffn_inp,
@@ -4310,7 +4751,7 @@ struct llm_build_deci : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
             }

             if (il == n_layer - 1) {
@@ -4452,7 +4893,7 @@ struct llm_build_baichuan : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -4567,7 +5008,7 @@ struct llm_build_xverse : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -4692,7 +5133,7 @@ struct llm_build_falcon : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -4822,7 +5263,7 @@ struct llm_build_grok : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
             }

             if (il == n_layer - 1) {
@@ -4973,7 +5414,7 @@ struct llm_build_dbrx : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5087,7 +5528,7 @@ struct llm_build_starcoder : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5186,7 +5627,7 @@ struct llm_build_refact : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5313,6 +5754,11 @@ struct llm_build_bert : public llm_graph_context {
                 cur = build_lora_mm(model.layers[il].wqkv, cur);
                 cb(cur, "wqkv", il);

+                if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
+                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                    cb(cur, "bqkv", il);
+                }
+
                 Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
                 Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
                 Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
@@ -5340,7 +5786,7 @@ struct llm_build_bert : public llm_graph_context {

             cur = build_attn(inp_attn, gf,
                     model.layers[il].wo, model.layers[il].bo,
-                    Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                    Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             cb(cur, "kqv_out", il);

             if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
@@ -5365,13 +5811,29 @@ struct llm_build_bert : public llm_graph_context {
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network
-            if (
+            if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
+                // MoE branch
+                cur = build_moe_ffn(cur,
+                        model.layers[il].ffn_gate_inp,
+                        model.layers[il].ffn_up_exps,
+                        nullptr,
+                        model.layers[il].ffn_down_exps,
+                        nullptr,
+                        hparams.n_expert,
+                        hparams.n_expert_used,
+                        LLM_FFN_GELU,
+                        false, false,
+                        0.0f,
+                        LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
+                cb(cur, "ffn_moe_out", il);
+            } else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
                         NULL, NULL, NULL,
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_SEQ, il);
+                cb(cur, "ffn_out", il);
             } else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, NULL, NULL,
@@ -5379,6 +5841,7 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
                         NULL,
                         LLM_FFN_GELU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
             } else {
                 cur = build_ffn(cur,
                         model.layers[il].ffn_up, NULL, NULL,
@@ -5386,8 +5849,8 @@ struct llm_build_bert : public llm_graph_context {
                         model.layers[il].ffn_down, NULL, NULL,
                         NULL,
                         LLM_FFN_SILU, LLM_FFN_PAR, il);
+                cb(cur, "ffn_out", il);
             }
-            cb(cur, "ffn_out", il);

             // attentions bypass the intermediate layer
             cur = ggml_add(ctx0, cur, ffn_inp);
@@ -5457,7 +5920,7 @@ struct llm_build_bloom : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5598,7 +6061,7 @@ struct llm_build_mpt : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5744,7 +6207,7 @@ struct llm_build_stablelm : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5867,7 +6330,7 @@ struct llm_build_qwen : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, NULL,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -5987,7 +6450,7 @@ struct llm_build_qwen2 : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -6108,7 +6571,7 @@ struct llm_build_qwen2vl : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -6193,16 +6656,25 @@ struct llm_build_qwen2moe : public llm_graph_context {
            {
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }

                 ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }

                 ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
@@ -6226,7 +6698,7 @@ struct llm_build_qwen2moe : public llm_graph_context {

                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
@@ -6257,7 +6729,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
                     false, 0.0,
                     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                     il);
-            cb(
+            cb(moe_out, "ffn_moe_out", il);

             // FFN shared expert
             {
@@ -6313,16 +6785,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
     }
 };

-struct
-
+struct llm_build_qwen3 : public llm_graph_context {
+    llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);

         ggml_tensor * cur;
-        ggml_tensor * attn_norm_output;
-        ggml_tensor * ffn_output;
         ggml_tensor * inpL;

         inpL = build_inp_embd(model.tok_embd);
@@ -6333,48 +6803,42 @@ struct llm_build_phi2 : public llm_graph_context {
         auto * inp_attn = build_attn_inp_kv_unified();

         for (int il = 0; il < n_layer; ++il) {
-
-
-
-
-
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);

             // self-attention
             {
-
-                ggml_tensor *
-                ggml_tensor * Vcur = nullptr;
-
-                if (model.layers[il].wqkv) {
-                    cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
-                    cb(cur, "wqkv", il);
-
-                    cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
-                    cb(cur, "bqkv", il);
-
-                    Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
-                    Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
-                    Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
-                } else {
-                    Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
-                    Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
-                    Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
-                }
-
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
                 Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

+                Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
+                cb(Qcur, "Qcur_normed", il);
+
                 Qcur = ggml_rope_ext(
                         ctx0, Qcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                         ext_factor, attn_factor, beta_fast, beta_slow
                         );

+                Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
+                cb(Kcur, "Kcur_normed", il);
+
                 Kcur = ggml_rope_ext(
                         ctx0, Kcur, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -6385,36 +6849,36 @@ struct llm_build_phi2 : public llm_graph_context {
                 cb(Kcur, "Kcur", il);
                 cb(Vcur, "Vcur", il);

-                // with phi2, we scale the Q to avoid precision issues
-                // ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
-                Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
-
                 cur = build_attn(inp_attn, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+                        Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
             }

             if (il == n_layer - 1) {
                 // skip computing output for unused tokens
                 ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur
-
-                attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-
-
-            ffn_output = build_ffn(attn_norm_output,
-                    model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
-                    NULL, NULL, NULL,
-                    model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
-                    NULL,
-                    LLM_FFN_GELU, LLM_FFN_SEQ, il);
-            cb(ffn_output, "ffn_out", il);
-            }
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);

-
-            cur =
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);

             cur = build_cvec(cur, il);
             cb(cur, "l_out", il);
@@ -6423,18 +6887,17 @@ struct llm_build_phi2 : public llm_graph_context {
             inpL = cur;
         }

-        cur =
-
-
-
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);

         cb(cur, "result_norm", -1);
         res->t_embd = cur;

+        // lm_head
         cur = build_lora_mm(model.output, cur);
-        cb(cur, "result_output_no_bias", -1);
-
-        cur = ggml_add(ctx0, cur, model.output_b);

         cb(cur, "result_output", -1);
         res->t_logits = cur;
@@ -6443,10 +6906,268 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
6443
6906
|
}
|
|
6444
6907
|
};
|
|
6445
6908
|
|
|
6446
|
-
struct
|
|
6447
|
-
|
|
6909
|
+
struct llm_build_qwen3moe : public llm_graph_context {
|
|
6910
|
+
llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6448
6911
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6449
|
-
|
|
6912
|
+
|
|
6913
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6914
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
6915
|
+
|
|
6916
|
+
ggml_tensor * cur;
|
|
6917
|
+
ggml_tensor * inpL;
|
|
6918
|
+
|
|
6919
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
6920
|
+
|
|
6921
|
+
// inp_pos - contains the positions
|
|
6922
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
6923
|
+
|
|
6924
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6925
|
+
|
|
6926
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
6927
|
+
ggml_tensor * inpSA = inpL;
|
|
6928
|
+
|
|
6929
|
+
// norm
|
|
6930
|
+
cur = build_norm(inpL,
|
|
6931
|
+
model.layers[il].attn_norm, NULL,
|
|
6932
|
+
LLM_NORM_RMS, il);
|
|
6933
|
+
cb(cur, "attn_norm", il);
|
|
6934
|
+
|
|
6935
|
+
// self_attention
|
|
6936
|
+
{
|
|
6937
|
+
// compute Q and K and RoPE them
|
|
6938
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
6939
|
+
cb(Qcur, "Qcur", il);
|
|
6940
|
+
|
|
6941
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
6942
|
+
cb(Kcur, "Kcur", il);
|
|
6943
|
+
|
|
6944
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
6945
|
+
cb(Vcur, "Vcur", il);
|
|
6946
|
+
|
|
6947
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6948
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6949
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6950
|
+
|
|
6951
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
6952
|
+
cb(Qcur, "Qcur_normed", il);
|
|
6953
|
+
|
|
6954
|
+
Qcur = ggml_rope_ext(
|
|
6955
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6956
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6957
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6958
|
+
);
|
|
6959
|
+
|
|
6960
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
6961
|
+
cb(Kcur, "Kcur_normed", il);
|
|
6962
|
+
|
|
6963
|
+
Kcur = ggml_rope_ext(
|
|
6964
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6965
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6966
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6967
|
+
);
|
|
6968
|
+
|
|
6969
|
+
cb(Qcur, "Qcur", il);
|
|
6970
|
+
cb(Kcur, "Kcur", il);
|
|
6971
|
+
cb(Vcur, "Vcur", il);
|
|
6972
|
+
|
|
6973
|
+
cur = build_attn(inp_attn, gf,
|
|
6974
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
6975
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6976
|
+
}
|
|
6977
|
+
|
|
6978
|
+
if (il == n_layer - 1) {
|
|
6979
|
+
// skip computing output for unused tokens
|
|
6980
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6981
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6982
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6983
|
+
}
|
|
6984
|
+
|
|
6985
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
6986
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6987
|
+
|
|
6988
|
+
// MoE branch
|
|
6989
|
+
cur = build_norm(ffn_inp,
|
|
6990
|
+
model.layers[il].ffn_norm, NULL,
|
|
6991
|
+
LLM_NORM_RMS, il);
|
|
6992
|
+
cb(cur, "ffn_norm", il);
|
|
6993
|
+
|
|
6994
|
+
ggml_tensor * moe_out =
|
|
6995
|
+
build_moe_ffn(cur,
|
|
6996
|
+
model.layers[il].ffn_gate_inp,
|
|
6997
|
+
model.layers[il].ffn_up_exps,
|
|
6998
|
+
model.layers[il].ffn_gate_exps,
|
|
6999
|
+
model.layers[il].ffn_down_exps,
|
|
7000
|
+
nullptr,
|
|
7001
|
+
n_expert, n_expert_used,
|
|
7002
|
+
LLM_FFN_SILU, true,
|
|
7003
|
+
false, 0.0,
|
|
7004
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
7005
|
+
il);
|
|
7006
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
7007
|
+
cur = moe_out;
|
|
7008
|
+
|
|
7009
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
7010
|
+
|
|
7011
|
+
cur = build_cvec(cur, il);
|
|
7012
|
+
cb(cur, "l_out", il);
|
|
7013
|
+
|
|
7014
|
+
// input for next layer
|
|
7015
|
+
inpL = cur;
|
|
7016
|
+
}
|
|
7017
|
+
|
|
7018
|
+
cur = inpL;
|
|
7019
|
+
|
|
7020
|
+
cur = build_norm(cur,
|
|
7021
|
+
model.output_norm, NULL,
|
|
7022
|
+
LLM_NORM_RMS, -1);
|
|
7023
|
+
|
|
7024
|
+
cb(cur, "result_norm", -1);
|
|
7025
|
+
res->t_embd = cur;
|
|
7026
|
+
|
|
7027
|
+
// lm_head
|
|
7028
|
+
cur = build_lora_mm(model.output, cur);
|
|
7029
|
+
|
|
7030
|
+
cb(cur, "result_output", -1);
|
|
7031
|
+
res->t_logits = cur;
|
|
7032
|
+
|
|
7033
|
+
ggml_build_forward_expand(gf, cur);
|
|
7034
|
+
}
|
|
7035
|
+
};
|
|
7036
|
+
|
|
7037
|
+
struct llm_build_phi2 : public llm_graph_context {
|
|
7038
|
+
llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7039
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7040
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
7041
|
+
|
|
7042
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7043
|
+
|
|
7044
|
+
ggml_tensor * cur;
|
|
7045
|
+
ggml_tensor * attn_norm_output;
|
|
7046
|
+
ggml_tensor * ffn_output;
|
|
7047
|
+
ggml_tensor * inpL;
|
|
7048
|
+
|
|
7049
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
7050
|
+
|
|
7051
|
+
// inp_pos - contains the positions
|
|
7052
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
7053
|
+
|
|
7054
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7055
|
+
|
|
7056
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7057
|
+
attn_norm_output = build_norm(inpL,
|
|
7058
|
+
model.layers[il].attn_norm,
|
|
7059
|
+
model.layers[il].attn_norm_b,
|
|
7060
|
+
LLM_NORM, il);
|
|
7061
|
+
cb(attn_norm_output, "attn_norm", il);
|
|
7062
|
+
|
|
7063
|
+
// self-attention
|
|
7064
|
+
{
|
|
7065
|
+
ggml_tensor * Qcur = nullptr;
|
|
7066
|
+
ggml_tensor * Kcur = nullptr;
|
|
7067
|
+
ggml_tensor * Vcur = nullptr;
|
|
7068
|
+
|
|
7069
|
+
if (model.layers[il].wqkv) {
|
|
7070
|
+
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
7071
|
+
cb(cur, "wqkv", il);
|
|
7072
|
+
|
|
7073
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7074
|
+
cb(cur, "bqkv", il);
|
|
7075
|
+
|
|
7076
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
7077
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
7078
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
7079
|
+
} else {
|
|
7080
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
7081
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
7082
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
7083
|
+
}
|
|
7084
|
+
|
|
7085
|
+
cb(Qcur, "Qcur", il);
|
|
7086
|
+
cb(Kcur, "Kcur", il);
|
|
7087
|
+
cb(Vcur, "Vcur", il);
|
|
7088
|
+
|
|
7089
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7090
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7091
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7092
|
+
|
|
7093
|
+
Qcur = ggml_rope_ext(
|
|
7094
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
7095
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7096
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7097
|
+
);
|
|
7098
|
+
|
|
7099
|
+
Kcur = ggml_rope_ext(
|
|
7100
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
7101
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7102
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7103
|
+
);
|
|
7104
|
+
|
|
7105
|
+
cb(Qcur, "Qcur", il);
|
|
7106
|
+
cb(Kcur, "Kcur", il);
|
|
7107
|
+
cb(Vcur, "Vcur", il);
|
|
7108
|
+
|
|
7109
|
+
// with phi2, we scale the Q to avoid precision issues
|
|
7110
|
+
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
|
7111
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
|
7112
|
+
|
|
7113
|
+
cur = build_attn(inp_attn, gf,
|
|
7114
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
7115
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7116
|
+
}
|
|
7117
|
+
|
|
7118
|
+
if (il == n_layer - 1) {
|
|
7119
|
+
// skip computing output for unused tokens
|
|
7120
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7121
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7122
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7123
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
|
7124
|
+
}
|
|
7125
|
+
|
|
7126
|
+
// FF
|
|
7127
|
+
{
|
|
7128
|
+
ffn_output = build_ffn(attn_norm_output,
|
|
7129
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
7130
|
+
NULL, NULL, NULL,
|
|
7131
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
7132
|
+
NULL,
|
|
7133
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
7134
|
+
cb(ffn_output, "ffn_out", il);
|
|
7135
|
+
}
|
|
7136
|
+
|
|
7137
|
+
cur = ggml_add(ctx0, cur, ffn_output);
|
|
7138
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
7139
|
+
|
|
7140
|
+
cur = build_cvec(cur, il);
|
|
7141
|
+
cb(cur, "l_out", il);
|
|
7142
|
+
|
|
7143
|
+
// input for next layer
|
|
7144
|
+
inpL = cur;
|
|
7145
|
+
}
|
|
7146
|
+
|
|
7147
|
+
cur = build_norm(inpL,
|
|
7148
|
+
model.output_norm,
|
|
7149
|
+
model.output_norm_b,
|
|
7150
|
+
LLM_NORM, -1);
|
|
7151
|
+
|
|
7152
|
+
cb(cur, "result_norm", -1);
|
|
7153
|
+
res->t_embd = cur;
|
|
7154
|
+
|
|
7155
|
+
cur = build_lora_mm(model.output, cur);
|
|
7156
|
+
cb(cur, "result_output_no_bias", -1);
|
|
7157
|
+
|
|
7158
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
7159
|
+
|
|
7160
|
+
cb(cur, "result_output", -1);
|
|
7161
|
+
res->t_logits = cur;
|
|
7162
|
+
|
|
7163
|
+
ggml_build_forward_expand(gf, cur);
|
|
7164
|
+
}
|
|
7165
|
+
};
|
|
7166
|
+
|
|
7167
|
+
struct llm_build_phi3 : public llm_graph_context {
|
|
7168
|
+
llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7169
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7170
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6450
7171
|
|
|
6451
7172
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6452
7173
|
|
|
@@ -6520,7 +7241,7 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6520
7241
|
|
|
6521
7242
|
cur = build_attn(inp_attn, gf,
|
|
6522
7243
|
model.layers[il].wo, model.layers[il].bo,
|
|
6523
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
7244
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
6524
7245
|
}
|
|
6525
7246
|
|
|
6526
7247
|
if (il == n_layer - 1) {
|
|
@@ -6655,7 +7376,7 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6655
7376
|
|
|
6656
7377
|
cur = build_attn(inp_attn, gf,
|
|
6657
7378
|
model.layers[il].wo, NULL,
|
|
6658
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7379
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6659
7380
|
}
|
|
6660
7381
|
ggml_tensor * sa_out = cur;
|
|
6661
7382
|
|
|
@@ -6762,7 +7483,7 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
6762
7483
|
|
|
6763
7484
|
cur = build_attn(inp_attn, gf,
|
|
6764
7485
|
model.layers[il].wo, model.layers[il].bo,
|
|
6765
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7486
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6766
7487
|
}
|
|
6767
7488
|
|
|
6768
7489
|
if (il == n_layer - 1) {
|
|
@@ -6878,7 +7599,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
6878
7599
|
|
|
6879
7600
|
cur = build_attn(inp_attn, gf,
|
|
6880
7601
|
model.layers[il].wo, model.layers[il].bo,
|
|
6881
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7602
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6882
7603
|
}
|
|
6883
7604
|
|
|
6884
7605
|
if (il == n_layer - 1) {
|
|
@@ -7007,7 +7728,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
7007
7728
|
|
|
7008
7729
|
cur = build_attn(inp_attn, gf,
|
|
7009
7730
|
model.layers[il].wo, NULL,
|
|
7010
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7731
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7011
7732
|
}
|
|
7012
7733
|
|
|
7013
7734
|
if (il == n_layer - 1) {
|
|
@@ -7134,7 +7855,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
7134
7855
|
|
|
7135
7856
|
cur = build_attn(inp_attn, gf,
|
|
7136
7857
|
model.layers[il].wo, model.layers[il].bo,
|
|
7137
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7858
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7138
7859
|
}
|
|
7139
7860
|
|
|
7140
7861
|
if (il == n_layer - 1) {
|
|
@@ -7331,7 +8052,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
7331
8052
|
|
|
7332
8053
|
cur = build_attn(inp_attn, gf,
|
|
7333
8054
|
model.layers[il].wo, NULL,
|
|
7334
|
-
q_states, k_states, v_states, nullptr, kq_scale, il);
|
|
8055
|
+
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
7335
8056
|
}
|
|
7336
8057
|
|
|
7337
8058
|
if (il == n_layer - 1) {
|
|
@@ -7461,7 +8182,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
7461
8182
|
|
|
7462
8183
|
cur = build_attn(inp_attn, gf,
|
|
7463
8184
|
model.layers[il].wo, NULL,
|
|
7464
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
8185
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7465
8186
|
}
|
|
7466
8187
|
|
|
7467
8188
|
if (il == n_layer - 1) {
|
|
@@ -7583,7 +8304,7 @@ struct llm_build_gemma2 : public llm_graph_context {
|
|
|
7583
8304
|
|
|
7584
8305
|
cur = build_attn(inp_attn, gf,
|
|
7585
8306
|
model.layers[il].wo, NULL,
|
|
7586
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
8307
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7587
8308
|
}
|
|
7588
8309
|
|
|
7589
8310
|
cur = build_norm(cur,
|
|
@@ -7724,7 +8445,7 @@ struct llm_build_gemma3 : public llm_graph_context {
|
|
|
7724
8445
|
|
|
7725
8446
|
cur = build_attn(inp_attn, gf,
|
|
7726
8447
|
model.layers[il].wo, NULL,
|
|
7727
|
-
Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
|
|
8448
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
7728
8449
|
}
|
|
7729
8450
|
|
|
7730
8451
|
cur = build_norm(cur,
|
|
@@ -7864,7 +8585,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
7864
8585
|
|
|
7865
8586
|
cur = build_attn(inp_attn, gf,
|
|
7866
8587
|
model.layers[il].wo, model.layers[il].bo,
|
|
7867
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8588
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7868
8589
|
}
|
|
7869
8590
|
|
|
7870
8591
|
if (il == n_layer - 1) {
|
|
@@ -8199,7 +8920,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
8199
8920
|
|
|
8200
8921
|
cur = build_attn(inp_attn, gf,
|
|
8201
8922
|
model.layers[il].wo, model.layers[il].bo,
|
|
8202
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8923
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8203
8924
|
}
|
|
8204
8925
|
|
|
8205
8926
|
if (il == n_layer - 1) {
|
|
@@ -8334,7 +9055,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
|
|
8334
9055
|
|
|
8335
9056
|
cur = build_attn(inp_attn, gf,
|
|
8336
9057
|
model.layers[il].wo, model.layers[il].bo,
|
|
8337
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9058
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8338
9059
|
}
|
|
8339
9060
|
|
|
8340
9061
|
if (il == n_layer - 1) {
|
|
@@ -8465,7 +9186,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
8465
9186
|
|
|
8466
9187
|
cur = build_attn(inp_attn, gf,
|
|
8467
9188
|
model.layers[il].wo, nullptr,
|
|
8468
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9189
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8469
9190
|
}
|
|
8470
9191
|
|
|
8471
9192
|
if (il == n_layer - 1) {
|
|
@@ -8585,7 +9306,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
8585
9306
|
|
|
8586
9307
|
cur = build_attn(inp_attn, gf,
|
|
8587
9308
|
model.layers[il].wo, NULL,
|
|
8588
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9309
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8589
9310
|
}
|
|
8590
9311
|
|
|
8591
9312
|
cur = build_norm(cur,
|
|
@@ -8718,7 +9439,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
8718
9439
|
|
|
8719
9440
|
cur = build_attn(inp_attn, gf,
|
|
8720
9441
|
model.layers[il].wo, NULL,
|
|
8721
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9442
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8722
9443
|
}
|
|
8723
9444
|
|
|
8724
9445
|
if (il == n_layer - 1) {
|
|
@@ -8851,7 +9572,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
8851
9572
|
|
|
8852
9573
|
cur = build_attn(inp_attn, gf,
|
|
8853
9574
|
model.layers[il].wo, NULL,
|
|
8854
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9575
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8855
9576
|
}
|
|
8856
9577
|
|
|
8857
9578
|
if (il == n_layer - 1) {
|
|
@@ -8965,7 +9686,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
8965
9686
|
|
|
8966
9687
|
cur = build_attn(inp_attn, gf,
|
|
8967
9688
|
model.layers[il].wo, model.layers[il].bo,
|
|
8968
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9689
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8969
9690
|
}
|
|
8970
9691
|
|
|
8971
9692
|
if (il == n_layer - 1) {
|
|
@@ -9115,7 +9836,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
9115
9836
|
|
|
9116
9837
|
cur = build_attn(inp_attn, gf,
|
|
9117
9838
|
model.layers[il].wo, NULL,
|
|
9118
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9839
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9119
9840
|
}
|
|
9120
9841
|
|
|
9121
9842
|
if (il == n_layer - 1) {
|
|
@@ -9270,7 +9991,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
9270
9991
|
|
|
9271
9992
|
cur = build_attn(inp_attn, gf,
|
|
9272
9993
|
model.layers[il].wo, model.layers[il].bo,
|
|
9273
|
-
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
|
9994
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
9274
9995
|
}
|
|
9275
9996
|
|
|
9276
9997
|
if (il == n_layer - 1) {
|
|
@@ -9360,16 +10081,23 @@ struct llm_build_deepseek2 : public llm_graph_context {
     llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         bool is_lite = (hparams.n_layer == 27);

-
-
-
-        const
-        const
+        const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
+
+        // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
+        const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
+        const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
+
+        const int64_t n_embd_head_qk_rope = hparams.n_rot;
+        const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;

-        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
         const uint32_t kv_lora_rank = hparams.n_lora_kv;

+        // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
+        // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+        const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
+        const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
+        const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+
         ggml_tensor * cur;
         ggml_tensor * inpL;

@@ -9394,16 +10122,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
             {
                 ggml_tensor * q = NULL;
                 if (!is_lite) {
-                    // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
                     q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                     cb(q, "q", il);

                     q = build_norm(q,
-                            model.layers[il].attn_q_a_norm,
+                            model.layers[il].attn_q_a_norm, nullptr,
                             LLM_NORM_RMS, il);
                     cb(q, "q", il);

-                    // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
                     q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
                     cb(q, "q", il);
                 } else {
@@ -9411,96 +10137,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9411
10137
|
cb(q, "q", il);
|
|
9412
10138
|
}
|
|
9413
10139
|
|
|
9414
|
-
// split into {
|
|
9415
|
-
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
9416
|
-
|
|
9417
|
-
ggml_row_size(q->type,
|
|
10140
|
+
// split into {n_embd_head_qk_nope, n_head, n_tokens}
|
|
10141
|
+
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
10142
|
+
n_embd_head_qk_nope, n_head, n_tokens,
|
|
10143
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10144
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9418
10145
|
0);
|
|
9419
10146
|
cb(q_nope, "q_nope", il);
|
|
9420
10147
|
|
|
9421
|
-
// and {
|
|
9422
|
-
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
9423
|
-
|
|
9424
|
-
ggml_row_size(q->type,
|
|
10148
|
+
// and {n_embd_head_qk_rope, n_head, n_tokens}
|
|
10149
|
+
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
10150
|
+
n_embd_head_qk_rope, n_head, n_tokens,
|
|
10151
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10152
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9425
10153
|
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
9426
10154
|
cb(q_pe, "q_pe", il);
|
|
9427
10155
|
|
|
9428
|
-
|
|
9429
|
-
|
|
9430
|
-
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
10156
|
+
ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
10157
|
+
cb(kv_cmpr_pe, "kv_cmpr_pe", il);
|
|
9431
10158
|
|
|
9432
10159
|
// split into {kv_lora_rank, n_tokens}
|
|
9433
|
-
ggml_tensor *
|
|
9434
|
-
|
|
10160
|
+
ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
|
|
10161
|
+
kv_lora_rank, n_tokens,
|
|
10162
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
9435
10163
|
0);
|
|
9436
|
-
cb(
|
|
10164
|
+
cb(kv_cmpr, "kv_cmpr", il);
|
|
10165
|
+
|
|
10166
|
+
// and {n_embd_head_qk_rope, 1, n_tokens}
|
|
10167
|
+
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
|
|
10168
|
+
n_embd_head_qk_rope, 1, n_tokens,
|
|
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+ ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);

-
-
-
-
-
+ q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
  cb(k_pe, "k_pe", il);

-
-
- kv_compressed = build_norm(kv_compressed,
- model.layers[il].attn_kv_a_norm, NULL,
+ kv_cmpr = build_norm(kv_cmpr,
+ model.layers[il].attn_kv_a_norm, nullptr,
  LLM_NORM_RMS, il);
- cb(
+ cb(kv_cmpr, "kv_cmpr", il);

-
-
-
+ if (is_mla) {
+ // {n_embd_head_qk_nope, n_tokens, n_head}
+ q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
+ cb(q_nope, "q_nope_perm", il);

-
-
-
- ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
- 0);
- cb(k_nope, "k_nope", il);
+ // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
+ ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
+ cb(q_nope_absorbed, "q_nope_absorbed", il);

-
-
-
- ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
- ggml_row_size(kv->type, (n_embd_head_qk_nope)));
- cb(v_states, "v_states", il);
+ // {kv_lora_rank, n_head, n_tokens}
+ q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
+ cb(q_nope_absorbed, "q_nope_absorbed_perm", il);

-
-
+ // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
+ cb(Qcur, "Qcur", il);

-
-
- 0);
- cb(v_states, "v_states", il);
+ kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
+ cb(kv_cmpr, "kv_cmpr_reshape", il);

-
-
-
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(q_pe, "q_pe", il);
+ // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
+ cb(Kcur, "Kcur", il);

-
-
-
- ctx0, k_pe, inp_pos, nullptr,
- n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
- ext_factor, attn_factor_scaled, beta_fast, beta_slow
- );
- cb(k_pe, "k_pe", il);
+ // {kv_lora_rank, 1, n_tokens}
+ ggml_tensor * Vcur = kv_cmpr;
+ cb(Vcur, "Vcur", il);

-
-
+ // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
+ } else {
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
+ cb(kv, "kv", il);
+
+ // split into {n_embd_head_qk_nope, n_head, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
+ n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ 0);
+ cb(k_nope, "k_nope_view", il);

-
-
+ // and {n_embd_head_v, n_head, n_tokens}
+ ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
+ n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
+ ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
+ ggml_row_size(kv->type, n_embd_head_qk_nope));
+ cb(Vcur, "Vcur_view", il);

-
-
-
+ Vcur = ggml_cont(ctx0, Vcur);
+ cb(Vcur, "Vcur_cont", il);
+
+ // note: rope must go first for in-place context shifting in build_rope_shift()
+ ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
+ cb(Qcur, "Qcur", il);
+
+ ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
+ cb(Kcur, "Kcur", il);
+
+ // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
+ }
  }

  if (il == n_layer - 1) {
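The hunk above is the core of the MLA (multi-head latent attention) rework for DeepSeek2: on the `is_mla` path the "nope" part of the query is absorbed through `wk_b`, so K and V collapse to a single shared latent head of width `kv_lora_rank` (plus the RoPE slice for K), i.e. the graph degenerates to MQA. A minimal standalone sketch of the resulting per-head shapes follows; the numeric sizes are made-up examples, not taken from any real model config.

    #include <cstdio>

    int main() {
        // assumed example sizes, not from a real config
        const int n_tokens            = 8;
        const int n_head              = 16;
        const int kv_lora_rank        = 512;  // latent width shared by K and V
        const int n_embd_head_qk_rope = 64;   // RoPE'd slice of each head

        // absorbed query: q_pe (rope) concatenated with wk_b * q_nope (latent)
        const int q_head_dim = n_embd_head_qk_rope + kv_lora_rank;
        // shared key: k_pe (rope) concatenated with the compressed kv latent
        const int k_head_dim = n_embd_head_qk_rope + kv_lora_rank;
        // shared value: the compressed kv latent itself
        const int v_head_dim = kv_lora_rank;

        printf("Q: {%d, %d, %d}  per-head\n", q_head_dim, n_head, n_tokens);
        printf("K: {%d, 1, %d}   single shared head -> MQA\n", k_head_dim, n_tokens);
        printf("V: {%d, 1, %d}   single shared head -> MQA\n", v_head_dim, n_tokens);
        return 0;
    }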
@@ -9666,7 +10421,7 @@ struct llm_build_bitnet : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  NULL, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

  cur = build_norm(cur,
  model.layers[il].attn_sub_norm, NULL,
@@ -9789,7 +10544,7 @@ struct llm_build_t5_enc : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo_enc, nullptr,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  cb(cur, "kqv_out", il);
  }

@@ -9895,7 +10650,7 @@ struct llm_build_t5_dec : public llm_graph_context {

  cur = build_attn(inp_attn_self, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, kq_b, 1.0f, il);
+ Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
  cb(cur, "kqv_out", il);
  }

@@ -9927,7 +10682,7 @@ struct llm_build_t5_dec : public llm_graph_context {

  cur = build_attn(inp_attn_cross, gf,
  model.layers[il].wo_cross, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f, il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
  cb(cur, "kqv_out", il);

  //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
@@ -10060,7 +10815,7 @@ struct llm_build_jais : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
  }

  if (il == n_layer - 1) {
@@ -10192,7 +10947,7 @@ struct llm_build_chatglm : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, NULL,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1) {
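The call-site hunks in this stretch all make the same mechanical change: `build_attn()` gained one extra tensor argument, inserted between the optional `kq_b` tensor and the scale, which the MLA path above uses to pass `model.layers[il].wv_b`; every non-MLA caller now fills that slot with an additional `nullptr`. A hedged sketch of that kind of signature migration follows; the names (`v_mla`, `build_attn_sketch`, the opaque `tensor` type) are illustrative placeholders, not the actual llama.cpp declarations.

    #include <cstdio>

    struct tensor;  // opaque placeholder standing in for ggml_tensor

    // hypothetical "after" signature: one extra optional weight slot (v_mla);
    // the real diff passes nullptr explicitly at every non-MLA call site
    tensor * build_attn_sketch(tensor * wo, tensor * bo,
                               tensor * q, tensor * k, tensor * v,
                               tensor * kq_b, tensor * v_mla,
                               float scale, int il) {
        (void)wo; (void)bo; (void)q; (void)k; (void)v; (void)kq_b; (void)scale;
        // v_mla is only non-null on the MLA/absorption path
        printf("layer %d: v_mla %s\n", il, v_mla ? "set" : "nullptr");
        return nullptr;
    }

    int main() {
        // regular (non-MLA) caller: the new slot is filled with nullptr
        build_attn_sketch(nullptr, nullptr, nullptr, nullptr, nullptr,
                          /*kq_b=*/nullptr, /*v_mla=*/nullptr, 1.0f, 0);
        return 0;
    }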
@@ -10245,6 +11000,157 @@ struct llm_build_chatglm : public llm_graph_context {
  }
  };

+ struct llm_build_glm4 : public llm_graph_context {
+ llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const int64_t n_embd_head = hparams.n_embd_head_v;
+ const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // Pre-attention norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ ggml_tensor * Qcur = nullptr;
+ ggml_tensor * Kcur = nullptr;
+ ggml_tensor * Vcur = nullptr;
+
+ if (model.layers[il].wqkv == nullptr) {
+ Qcur = build_lora_mm(model.layers[il].wq, cur);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ }
+ Kcur = build_lora_mm(model.layers[il].wk, cur);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ }
+ Vcur = build_lora_mm(model.layers[il].wv, cur);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ }
+ } else {
+ cur = build_lora_mm(model.layers[il].wqkv, cur);
+ cb(cur, "wqkv", il);
+ if (model.layers[il].bqkv) {
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+ cb(cur, "bqkv", il);
+ }
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ // Post-attention norm (new!)
+ cur = build_norm(cur,
+ model.layers[il].attn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_attn_norm", il);
+
+ // Add the input (residual connection after post-attention norm)
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ // FF
+ {
+ // Pre-MLP norm
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ // MLP
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ // Post-MLP norm
+ cur = build_norm(cur,
+ model.layers[il].ffn_post_norm,
+ NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "post_mlp_norm", il);
+ }
+
+ // Add residual connection after post-MLP norm
+ inpL = ggml_add(ctx0, cur, ffn_inp);
+ cb(inpL, "l_out", il);
+ }
+
+ // Final norm
+ cur = build_norm(inpL,
+ model.output_norm,
+ NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // Output projection
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
  struct llm_build_nemotron : public llm_graph_context {
  llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
  const int64_t n_embd_head = hparams.n_embd_head_v;
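The new `llm_build_glm4` graph differs from `llm_build_chatglm` mainly in its normalization layout: each block applies RMSNorm before and after both the attention and the MLP, with the residual added only after the post-norm. A compact sketch of that ordering on plain vectors is shown below; the RMSNorm formula is standard, while the identity `sublayer` and the example values are purely illustrative.

    #include <cmath>
    #include <cstdio>
    #include <vector>

    using vec = std::vector<float>;

    static vec rms_norm(const vec & x, float eps = 1e-6f) {
        float ss = 0.0f;
        for (float v : x) ss += v * v;
        const float scale = 1.0f / std::sqrt(ss / x.size() + eps);
        vec y(x.size());
        for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * scale;
        return y;
    }

    // stand-in for the attention / MLP sub-layers (identity, just to show ordering)
    static vec sublayer(const vec & x) { return x; }

    int main() {
        vec inpL = {1.0f, -2.0f, 3.0f, -4.0f};

        // GLM4-style block: pre-norm -> sublayer -> post-norm -> residual add
        vec cur = rms_norm(inpL);          // attn_norm
        cur     = sublayer(cur);           // attention
        cur     = rms_norm(cur);           // attn_post_norm
        vec ffn_inp(inpL.size());
        for (size_t i = 0; i < inpL.size(); ++i) ffn_inp[i] = cur[i] + inpL[i];

        cur = rms_norm(ffn_inp);           // ffn_norm
        cur = sublayer(cur);               // MLP (SwiGLU in the real graph)
        cur = rms_norm(cur);               // ffn_post_norm
        for (size_t i = 0; i < inpL.size(); ++i) inpL[i] = cur[i] + ffn_inp[i];

        for (float v : inpL) printf("%.4f ", v);
        printf("\n");
        return 0;
    }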
@@ -10318,7 +11224,7 @@ struct llm_build_nemotron : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1) {
@@ -10449,7 +11355,7 @@ struct llm_build_exaone : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, model.layers[il].bo,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
  }

  if (il == n_layer - 1) {
@@ -11351,7 +12257,7 @@ struct llm_build_chameleon : public llm_graph_context {

  cur = build_attn(inp_attn, gf,
  model.layers[il].wo, nullptr,
- Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);

  if (hparams.swin_norm) {
  cur = build_norm(cur,
@@ -11588,32 +12494,348 @@ struct llm_build_wavtokenizer_dec : public llm_graph_context {
  }
  };

-
-
+ struct llm_build_plm : public llm_graph_context {
+ llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));

-
-
-
- case LLM_ARCH_RWKV6QWEN2:
- case LLM_ARCH_RWKV7:
- case LLM_ARCH_ARWKV7:
- {
- res = new llama_kv_cache_unified(hparams, {
- /*.get_rope_factors =*/ nullptr
- });
- } break;
- default:
- {
- res = new llama_kv_cache_unified(hparams, {
- /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
- // choose long/short freq factors based on the context size
- if (layers[il].rope_freqs != nullptr) {
- return layers[il].rope_freqs;
- }
+ const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+ const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+ const uint32_t kv_lora_rank = hparams.n_lora_kv;

-
-
-
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ // {n_embd, n_tokens}
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self_attention
+ {
+ ggml_tensor * q = NULL;
+ q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+ cb(q, "q", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ 0);
+ cb(q_nope, "q_nope", il);
+
+ // and {n_head * n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+ ggml_row_size(q->type, hparams.n_embd_head_k),
+ ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+ ggml_row_size(q->type, n_embd_head_qk_nope));
+ cb(q_pe, "q_pe", il);
+
+ // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+ cb(kv_pe_compresseed, "kv_pe_compresseed", il);
+
+ // split into {kv_lora_rank, n_tokens}
+ ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+ kv_pe_compresseed->nb[1],
+ 0);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // and {n_embd_head_qk_rope, n_tokens}
+ ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+ kv_pe_compresseed->nb[1],
+ kv_pe_compresseed->nb[1],
+ ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+ cb(k_pe, "k_pe", il);
+
+ kv_compressed = build_norm(kv_compressed,
+ model.layers[il].attn_kv_a_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(kv_compressed, "kv_compressed", il);
+
+ // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
+ ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
+ cb(kv, "kv", il);
+
+ // split into {n_head * n_embd_head_qk_nope, n_tokens}
+ ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+ ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+ ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ 0);
+ cb(k_nope, "k_nope", il);
+
+ // and {n_head * n_embd_head_v, n_tokens}
+ ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+ ggml_row_size(kv->type, (n_embd_head_qk_nope)));
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_cont(ctx0, v_states);
+ cb(v_states, "v_states", il);
+
+ v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+ ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+ 0);
+ cb(v_states, "v_states", il);
+
+ q_pe = ggml_rope_ext(
+ ctx0, q_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(q_pe, "q_pe", il);
+
+ // shared RoPE key
+ k_pe = ggml_rope_ext(
+ ctx0, k_pe, inp_pos, nullptr,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+ cb(k_pe, "k_pe", il);
+
+ ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
+ cb(q_states, "q_states", il);
+
+ ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+ cb(k_states, "k_states", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, NULL,
+ q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ cur = build_ffn(cur,
+ model.layers[il].ffn_up, NULL, NULL,
+ NULL, NULL, NULL,
+ model.layers[il].ffn_down, NULL, NULL,
+ NULL,
+ LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
+ cb(cur, "ffn_out", il);
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ struct llm_build_bailingmoe : public llm_graph_context {
+ llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+ ggml_tensor * cur;
+ ggml_tensor * inpL;
+
+ inpL = build_inp_embd(model.tok_embd);
+
+ // inp_pos - contains the positions
+ ggml_tensor * inp_pos = build_inp_pos();
+
+ auto * inp_attn = build_attn_inp_kv_unified();
+
+ for (int il = 0; il < n_layer; ++il) {
+ ggml_tensor * inpSA = inpL;
+
+ // norm
+ cur = build_norm(inpL,
+ model.layers[il].attn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "attn_norm", il);
+
+ // self-attention
+ {
+ // rope freq factors for llama3; may return nullptr for llama2 and other models
+ ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+
+ // compute Q and K and RoPE them
+ ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+ cb(Qcur, "Qcur", il);
+ if (model.layers[il].bq) {
+ Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+ cb(Qcur, "Qcur", il);
+ }
+
+ ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+ cb(Kcur, "Kcur", il);
+ if (model.layers[il].bk) {
+ Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+ cb(Kcur, "Kcur", il);
+ }
+
+ ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+ cb(Vcur, "Vcur", il);
+ if (model.layers[il].bv) {
+ Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+ cb(Vcur, "Vcur", il);
+ }
+
+ Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
+ Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
+
+ Qcur = ggml_rope_ext(
+ ctx0, Qcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ Kcur = ggml_rope_ext(
+ ctx0, Kcur, inp_pos, rope_factors,
+ n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+ ext_factor, attn_factor, beta_fast, beta_slow
+ );
+
+ cb(Qcur, "Qcur", il);
+ cb(Kcur, "Kcur", il);
+ cb(Vcur, "Vcur", il);
+
+ cur = build_attn(inp_attn, gf,
+ model.layers[il].wo, model.layers[il].bo,
+ Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
+ }
+
+ if (il == n_layer - 1) {
+ // skip computing output for unused tokens
+ ggml_tensor * inp_out_ids = build_inp_out_ids();
+ cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+ inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+ }
+
+ ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+ cb(ffn_inp, "ffn_inp", il);
+
+ cur = build_norm(ffn_inp,
+ model.layers[il].ffn_norm, NULL,
+ LLM_NORM_RMS, il);
+ cb(cur, "ffn_norm", il);
+
+ ggml_tensor * moe_out =
+ build_moe_ffn(cur,
+ model.layers[il].ffn_gate_inp,
+ model.layers[il].ffn_up_exps,
+ model.layers[il].ffn_gate_exps,
+ model.layers[il].ffn_down_exps,
+ nullptr,
+ n_expert, n_expert_used,
+ LLM_FFN_SILU, hparams.expert_weights_norm,
+ false, hparams.expert_weights_scale,
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+ il);
+ cb(moe_out, "ffn_moe_out", il);
+
+ // FFN shared expert
+ {
+ ggml_tensor * ffn_shexp = build_ffn(cur,
+ model.layers[il].ffn_up_shexp, NULL, NULL,
+ model.layers[il].ffn_gate_shexp, NULL, NULL,
+ model.layers[il].ffn_down_shexp, NULL, NULL,
+ NULL,
+ LLM_FFN_SILU, LLM_FFN_PAR, il);
+ cb(ffn_shexp, "ffn_shexp", il);
+
+ cur = ggml_add(ctx0, moe_out, ffn_shexp);
+ cb(cur, "ffn_out", il);
+ }
+
+ cur = ggml_add(ctx0, cur, ffn_inp);
+
+ cur = build_cvec(cur, il);
+ cb(cur, "l_out", il);
+
+ // input for next layer
+ inpL = cur;
+ }
+
+ cur = inpL;
+
+ cur = build_norm(cur,
+ model.output_norm, NULL,
+ LLM_NORM_RMS, -1);
+
+ cb(cur, "result_norm", -1);
+ res->t_embd = cur;
+
+ // lm_head
+ cur = build_lora_mm(model.output, cur);
+
+ cb(cur, "result_output", -1);
+ res->t_logits = cur;
+
+ ggml_build_forward_expand(gf, cur);
+ }
+ };
+
+ llama_memory_i * llama_model::create_memory() const {
+ llama_memory_i * res;
+
+ switch (arch) {
+ case LLM_ARCH_MAMBA:
+ case LLM_ARCH_RWKV6:
+ case LLM_ARCH_RWKV6QWEN2:
+ case LLM_ARCH_RWKV7:
+ case LLM_ARCH_ARWKV7:
+ {
+ res = new llama_kv_cache_unified(hparams, {
+ /*.get_rope_factors =*/ nullptr
+ });
+ } break;
+ default:
+ {
+ res = new llama_kv_cache_unified(hparams, {
+ /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
+ // choose long/short freq factors based on the context size
+ if (layers[il].rope_freqs != nullptr) {
+ return layers[il].rope_freqs;
+ }
+
+ if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+ return layers[il].rope_long;
+ }

  return layers[il].rope_short;
  }
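`create_memory()` now hands the unified KV cache a `get_rope_factors` callback instead of resolving the frequency-factor tensor up front; the recurrent architectures (Mamba and the RWKV variants) pass `nullptr`. A small sketch of the same selection logic on plain pointers follows; the `fake_tensor`/`fake_layer` types and field values are placeholders, not the llama.cpp ones.

    #include <cstdint>
    #include <cstdio>

    struct fake_tensor { const char * name; };

    struct fake_layer {
        fake_tensor * rope_freqs = nullptr;  // per-layer override, if the model ships one
        fake_tensor * rope_long  = nullptr;  // long-context factors (YaRN-style)
        fake_tensor * rope_short = nullptr;  // short-context factors
    };

    int main() {
        fake_tensor long_f{"rope_long"}, short_f{"rope_short"};
        fake_layer layer{nullptr, &long_f, &short_f};
        const uint32_t n_ctx_orig_yarn = 4096;

        // mirrors the lambda passed as get_rope_factors in the hunk above
        auto get_rope_factors = [&](uint32_t n_ctx_per_seq) -> fake_tensor * {
            if (layer.rope_freqs != nullptr) return layer.rope_freqs;
            if (n_ctx_per_seq > n_ctx_orig_yarn) return layer.rope_long;
            return layer.rope_short;
        };

        printf("ctx=2048  -> %s\n", get_rope_factors(2048)->name);
        printf("ctx=32768 -> %s\n", get_rope_factors(32768)->name);
        return 0;
    }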
@@ -11632,6 +12854,7 @@ llm_graph_result_ptr llama_model::build_graph(

  switch (arch) {
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA4:
  case LLM_ARCH_MINICPM:
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
@@ -11665,6 +12888,7 @@ llm_graph_result_ptr llama_model::build_graph(
  case LLM_ARCH_BERT:
  case LLM_ARCH_JINA_BERT_V2:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  {
  llm = std::make_unique<llm_build_bert>(*this, params, gf);
  } break;
@@ -11696,6 +12920,14 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
  } break;
+ case LLM_ARCH_QWEN3:
+ {
+ llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
+ } break;
+ case LLM_ARCH_QWEN3MOE:
+ {
+ llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
+ } break;
  case LLM_ARCH_PHI2:
  {
  llm = std::make_unique<llm_build_phi2>(*this, params, gf);
@@ -11801,6 +13033,10 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
  } break;
+ case LLM_ARCH_GLM4:
+ {
+ llm = std::make_unique<llm_build_glm4>(*this, params, gf);
+ } break;
  case LLM_ARCH_BITNET:
  {
  llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
@@ -11819,10 +13055,11 @@ llm_graph_result_ptr llama_model::build_graph(
  GGML_ABORT("invalid graph type");
  };
  } break;
-
-
-
-
+ case LLM_ARCH_T5ENCODER:
+ {
+ llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
+ }
+ break;
  case LLM_ARCH_JAIS:
  {
  llm = std::make_unique<llm_build_jais>(*this, params, gf);
@@ -11859,6 +13096,14 @@ llm_graph_result_ptr llama_model::build_graph(
  {
  llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
  } break;
+ case LLM_ARCH_PLM:
+ {
+ llm = std::make_unique<llm_build_plm>(*this, params, gf);
+ } break;
+ case LLM_ARCH_BAILINGMOE:
+ {
+ llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
+ } break;
  default:
  GGML_ABORT("fatal error");
  }
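The new architectures (LLAMA4, NOMIC_BERT_MOE, QWEN3/QWEN3MOE, GLM4, T5ENCODER, PLM, BAILINGMOE) are all wired up the same way: one `case` in the `build_graph` switch that constructs the matching `llm_build_*` object. A reduced sketch of that factory pattern follows; the enum values and builder types below are placeholders, not the real llama.cpp names.

    #include <memory>
    #include <stdexcept>

    // placeholder stand-ins for llm_graph_context and its subclasses
    struct graph_ctx { virtual ~graph_ctx() = default; };
    struct build_glm4_sketch : graph_ctx {};
    struct build_plm_sketch  : graph_ctx {};

    enum class arch_sketch { GLM4, PLM };

    std::unique_ptr<graph_ctx> build_graph_sketch(arch_sketch arch) {
        switch (arch) {
            case arch_sketch::GLM4: return std::make_unique<build_glm4_sketch>();
            case arch_sketch::PLM:  return std::make_unique<build_plm_sketch>();
        }
        throw std::runtime_error("unknown architecture");  // mirrors GGML_ABORT("fatal error")
    }

    int main() {
        auto g = build_graph_sketch(arch_sketch::GLM4);
        return g ? 0 : 1;
    }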
@@ -11876,6 +13121,7 @@ llm_graph_result_ptr llama_model::build_graph(
  llama_model_params llama_model_default_params() {
  llama_model_params result = {
  /*.devices =*/ nullptr,
+ /*.tensor_buft_overrides =*/ nullptr,
  /*.n_gpu_layers =*/ 0,
  /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
  /*.main_gpu =*/ 0,
@@ -11971,6 +13217,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {

  // use what we call a normal RoPE, operating on pairs of consecutive head values
  case LLM_ARCH_LLAMA:
+ case LLM_ARCH_LLAMA4:
  case LLM_ARCH_DECI:
  case LLM_ARCH_BAICHUAN:
  case LLM_ARCH_STARCODER:
@@ -11985,10 +13232,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_ARCTIC:
  case LLM_ARCH_DEEPSEEK:
  case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_PLM:
  case LLM_ARCH_CHATGLM:
+ case LLM_ARCH_GLM4:
  case LLM_ARCH_GRANITE:
  case LLM_ARCH_GRANITE_MOE:
  case LLM_ARCH_CHAMELEON:
+ case LLM_ARCH_BAILINGMOE:
  return LLAMA_ROPE_TYPE_NORM;

  // the pairs of head values are offset by n_rot/2
@@ -11997,11 +13247,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
  case LLM_ARCH_DBRX:
  case LLM_ARCH_BERT:
  case LLM_ARCH_NOMIC_BERT:
+ case LLM_ARCH_NOMIC_BERT_MOE:
  case LLM_ARCH_STABLELM:
  case LLM_ARCH_BITNET:
  case LLM_ARCH_QWEN:
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_QWEN2MOE:
+ case LLM_ARCH_QWEN3:
+ case LLM_ARCH_QWEN3MOE:
  case LLM_ARCH_OLMO2:
  case LLM_ARCH_OLMOE:
  case LLM_ARCH_PHI2:
|