@fugood/llama.node 0.3.16 → 0.3.17
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +3 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +5 -0
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +8 -0
- package/src/LlamaCompletionWorker.h +1 -0
- package/src/LlamaContext.cpp +3 -2
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +124 -0
- package/src/llama.cpp/.github/workflows/build.yml +70 -27
- package/src/llama.cpp/.github/workflows/docker.yml +6 -6
- package/src/llama.cpp/.github/workflows/server.yml +7 -11
- package/src/llama.cpp/CMakeLists.txt +23 -1
- package/src/llama.cpp/common/CMakeLists.txt +6 -3
- package/src/llama.cpp/common/arg.cpp +809 -105
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +1 -1
- package/src/llama.cpp/common/common.cpp +31 -521
- package/src/llama.cpp/common/common.h +17 -36
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +30 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +15 -7
- package/src/llama.cpp/common/minja/minja.hpp +119 -93
- package/src/llama.cpp/common/sampling.cpp +3 -0
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +7 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +210 -8
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +39 -24
- package/src/llama.cpp/examples/llava/clip-impl.h +345 -0
- package/src/llama.cpp/examples/llava/clip.cpp +2152 -1803
- package/src/llama.cpp/examples/llava/clip.h +39 -22
- package/src/llama.cpp/examples/llava/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/examples/llava/llava.cpp +64 -52
- package/src/llama.cpp/examples/llava/mtmd-cli.cpp +344 -0
- package/src/llama.cpp/examples/llava/mtmd.cpp +708 -0
- package/src/llama.cpp/examples/llava/mtmd.h +168 -0
- package/src/llama.cpp/examples/llava/{qwen2vl-cli.cpp → qwen2vl-test.cpp} +83 -31
- package/src/llama.cpp/examples/main/main.cpp +16 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +3 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +17 -3
- package/src/llama.cpp/examples/quantize/quantize.cpp +115 -2
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +4 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +163 -8
- package/src/llama.cpp/examples/run/CMakeLists.txt +12 -1
- package/src/llama.cpp/examples/run/run.cpp +14 -28
- package/src/llama.cpp/examples/server/httplib.h +313 -247
- package/src/llama.cpp/examples/server/server.cpp +238 -139
- package/src/llama.cpp/examples/server/utils.hpp +51 -2
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/tts/tts.cpp +6 -9
- package/src/llama.cpp/ggml/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +66 -99
- package/src/llama.cpp/ggml/src/CMakeLists.txt +10 -7
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +48 -22
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -192
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +754 -404
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1003 -13519
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +2 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +2 -7
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +0 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +3 -4
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +533 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8809 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +258 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +70 -3
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -260
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +293 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +96 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +350 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +2 -292
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +967 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +204 -280
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +23 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +646 -114
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +17 -8
- package/src/llama.cpp/ggml/src/ggml.c +141 -245
- package/src/llama.cpp/ggml/src/gguf.cpp +1 -0
- package/src/llama.cpp/include/llama.h +30 -11
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +2 -0
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/src/CMakeLists.txt +3 -2
- package/src/llama.cpp/src/llama-adapter.cpp +37 -1
- package/src/llama.cpp/src/llama-arch.cpp +160 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-chat.cpp +82 -17
- package/src/llama.cpp/src/llama-chat.h +6 -2
- package/src/llama.cpp/src/llama-context.cpp +108 -92
- package/src/llama.cpp/src/llama-context.h +1 -2
- package/src/llama.cpp/src/llama-graph.cpp +189 -119
- package/src/llama.cpp/src/llama-graph.h +26 -6
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +70 -123
- package/src/llama.cpp/src/llama-kv-cache.h +41 -115
- package/src/llama.cpp/src/llama-memory.h +1 -1
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +10 -5
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model.cpp +1760 -534
- package/src/llama.cpp/src/llama-model.h +13 -1
- package/src/llama.cpp/src/llama-quant.cpp +29 -8
- package/src/llama.cpp/src/llama-sampling.cpp +7 -1
- package/src/llama.cpp/src/llama-vocab.cpp +44 -6
- package/src/llama.cpp/src/llama.cpp +1 -1
- package/src/llama.cpp/tests/CMakeLists.txt +43 -30
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +82 -43
- package/src/llama.cpp/tests/test-chat-template.cpp +34 -13
- package/src/llama.cpp/tests/test-chat.cpp +12 -2
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
#include <cmath>
|
|
18
18
|
#include <functional>
|
|
19
19
|
#include <map>
|
|
20
|
+
#include <regex>
|
|
20
21
|
#include <sstream>
|
|
21
22
|
#include <stdexcept>
|
|
22
23
|
|
|
@@ -42,11 +43,14 @@ const char * llm_type_name(llm_type type) {
|
|
|
42
43
|
case LLM_TYPE_770M: return "770M";
|
|
43
44
|
case LLM_TYPE_780M: return "780M";
|
|
44
45
|
case LLM_TYPE_0_5B: return "0.5B";
|
|
46
|
+
case LLM_TYPE_0_6B: return "0.6B";
|
|
45
47
|
case LLM_TYPE_1B: return "1B";
|
|
46
48
|
case LLM_TYPE_1_3B: return "1.3B";
|
|
47
49
|
case LLM_TYPE_1_4B: return "1.4B";
|
|
48
50
|
case LLM_TYPE_1_5B: return "1.5B";
|
|
49
51
|
case LLM_TYPE_1_6B: return "1.6B";
|
|
52
|
+
case LLM_TYPE_1_7B: return "1.7B";
|
|
53
|
+
case LLM_TYPE_1_8B: return "1.8B";
|
|
50
54
|
case LLM_TYPE_2B: return "2B";
|
|
51
55
|
case LLM_TYPE_2_8B: return "2.8B";
|
|
52
56
|
case LLM_TYPE_2_9B: return "2.9B";
|
|
@@ -64,6 +68,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
64
68
|
case LLM_TYPE_15B: return "15B";
|
|
65
69
|
case LLM_TYPE_16B: return "16B";
|
|
66
70
|
case LLM_TYPE_20B: return "20B";
|
|
71
|
+
case LLM_TYPE_27B: return "27B";
|
|
67
72
|
case LLM_TYPE_30B: return "30B";
|
|
68
73
|
case LLM_TYPE_32B: return "32B";
|
|
69
74
|
case LLM_TYPE_34B: return "34B";
|
|
@@ -72,6 +77,7 @@ const char * llm_type_name(llm_type type) {
|
|
|
72
77
|
case LLM_TYPE_65B: return "65B";
|
|
73
78
|
case LLM_TYPE_70B: return "70B";
|
|
74
79
|
case LLM_TYPE_236B: return "236B";
|
|
80
|
+
case LLM_TYPE_290B: return "290B";
|
|
75
81
|
case LLM_TYPE_314B: return "314B";
|
|
76
82
|
case LLM_TYPE_671B: return "671B";
|
|
77
83
|
case LLM_TYPE_SMALL: return "0.1B";
|
|
@@ -86,7 +92,10 @@ const char * llm_type_name(llm_type type) {
|
|
|
86
92
|
case LLM_TYPE_16x3_8B: return "16x3.8B";
|
|
87
93
|
case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B";
|
|
88
94
|
case LLM_TYPE_57B_A14B: return "57B.A14B";
|
|
89
|
-
case
|
|
95
|
+
case LLM_TYPE_17B_16E: return "17Bx16E (Scout)";
|
|
96
|
+
case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)";
|
|
97
|
+
case LLM_TYPE_30B_A3B: return "30B.A3B";
|
|
98
|
+
case LLM_TYPE_235B_A22B: return "235B.A22B";
|
|
90
99
|
default: return "?B";
|
|
91
100
|
}
|
|
92
101
|
}
|
|
@@ -255,7 +264,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
|
|
|
255
264
|
return nullptr;
|
|
256
265
|
}
|
|
257
266
|
|
|
258
|
-
// CPU: ACCEL -> CPU extra -> GPU host -> CPU
|
|
267
|
+
// CPU: ACCEL -> GPU host -> CPU extra -> CPU
|
|
259
268
|
static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
|
|
260
269
|
buft_list_t buft_list;
|
|
261
270
|
|
|
@@ -271,32 +280,6 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
271
280
|
}
|
|
272
281
|
}
|
|
273
282
|
|
|
274
|
-
bool has_gpu_device = false;
|
|
275
|
-
for (auto * dev : devices) {
|
|
276
|
-
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
277
|
-
has_gpu_device = true;
|
|
278
|
-
break;
|
|
279
|
-
}
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
// add extra buffer types, only if no GPU device is present
|
|
283
|
-
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
|
284
|
-
if (!has_gpu_device) {
|
|
285
|
-
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
286
|
-
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
287
|
-
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
288
|
-
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
289
|
-
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
290
|
-
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
291
|
-
while (extra_bufts && *extra_bufts) {
|
|
292
|
-
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
293
|
-
++extra_bufts;
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
} else {
|
|
297
|
-
LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__);
|
|
298
|
-
}
|
|
299
|
-
|
|
300
283
|
// add a host buffer type
|
|
301
284
|
// storing the tensors in a host buffer is useful when the processing of large batches
|
|
302
285
|
// is offloaded to a GPU device, since it reduces the time spent on data transfers
|
|
@@ -311,6 +294,20 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
|
|
|
311
294
|
}
|
|
312
295
|
}
|
|
313
296
|
|
|
297
|
+
// add extra buffer types, only if no GPU device is present
|
|
298
|
+
// ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
|
|
299
|
+
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
300
|
+
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
|
301
|
+
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
|
302
|
+
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
|
303
|
+
if (ggml_backend_dev_get_extra_bufts_fn) {
|
|
304
|
+
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
|
305
|
+
while (extra_bufts && *extra_bufts) {
|
|
306
|
+
buft_list.emplace_back(cpu_dev, *extra_bufts);
|
|
307
|
+
++extra_bufts;
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
|
|
314
311
|
// add the CPU buffer type
|
|
315
312
|
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
316
313
|
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
|
@@ -388,9 +385,12 @@ struct llama_model::impl {
|
|
|
388
385
|
layer_dev dev_input = {};
|
|
389
386
|
layer_dev dev_output = {};
|
|
390
387
|
std::vector<layer_dev> dev_layer;
|
|
388
|
+
|
|
389
|
+
bool has_tensor_overrides;
|
|
391
390
|
};
|
|
392
391
|
|
|
393
392
|
llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique<impl>()) {
|
|
393
|
+
pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern;
|
|
394
394
|
}
|
|
395
395
|
|
|
396
396
|
llama_model::~llama_model() {}
|
|
@@ -556,6 +556,25 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
556
556
|
}
|
|
557
557
|
}
|
|
558
558
|
} break;
|
|
559
|
+
case LLM_ARCH_LLAMA4:
|
|
560
|
+
{
|
|
561
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
562
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
563
|
+
ml.get_key(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step);
|
|
564
|
+
hparams.n_swa_pattern = 4; // pattern: 3 chunked - 1 full
|
|
565
|
+
hparams.n_attn_chunk = 8192; // should this be a gguf kv? currently it's the same for Scout and Maverick
|
|
566
|
+
hparams.n_swa = 1; // TODO @ngxson : this is added to trigger the SWA branch (we store the chunked attn mask in the SWA tensor), will need to clean this up later
|
|
567
|
+
|
|
568
|
+
switch (hparams.n_expert) {
|
|
569
|
+
case 16: type = LLM_TYPE_17B_16E; break;
|
|
570
|
+
case 128: type = LLM_TYPE_17B_128E; break;
|
|
571
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
572
|
+
}
|
|
573
|
+
|
|
574
|
+
if (type == LLM_TYPE_17B_128E) {
|
|
575
|
+
hparams.use_kq_norm = false;
|
|
576
|
+
}
|
|
577
|
+
} break;
|
|
559
578
|
case LLM_ARCH_DECI:
|
|
560
579
|
{
|
|
561
580
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -680,10 +699,12 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
680
699
|
}
|
|
681
700
|
} break;
|
|
682
701
|
case LLM_ARCH_NOMIC_BERT:
|
|
702
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
683
703
|
{
|
|
684
704
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
685
705
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
686
706
|
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
|
707
|
+
ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0);
|
|
687
708
|
|
|
688
709
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
|
689
710
|
type = LLM_TYPE_137M;
|
|
@@ -772,6 +793,28 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
772
793
|
default: type = LLM_TYPE_UNKNOWN;
|
|
773
794
|
}
|
|
774
795
|
} break;
|
|
796
|
+
case LLM_ARCH_QWEN3:
|
|
797
|
+
{
|
|
798
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
799
|
+
switch (hparams.n_layer) {
|
|
800
|
+
case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break;
|
|
801
|
+
case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break;
|
|
802
|
+
case 40: type = LLM_TYPE_14B; break;
|
|
803
|
+
case 64: type = LLM_TYPE_32B; break;
|
|
804
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
805
|
+
}
|
|
806
|
+
} break;
|
|
807
|
+
case LLM_ARCH_QWEN3MOE:
|
|
808
|
+
{
|
|
809
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
|
|
810
|
+
|
|
811
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
812
|
+
switch (hparams.n_layer) {
|
|
813
|
+
case 48: type = LLM_TYPE_30B_A3B; break;
|
|
814
|
+
case 94: type = LLM_TYPE_235B_A22B; break;
|
|
815
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
816
|
+
}
|
|
817
|
+
} break;
|
|
775
818
|
case LLM_ARCH_PHI2:
|
|
776
819
|
{
|
|
777
820
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
|
@@ -1125,6 +1168,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1125
1168
|
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
|
1126
1169
|
}
|
|
1127
1170
|
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1171
|
+
ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false);
|
|
1172
|
+
ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false);
|
|
1128
1173
|
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1129
1174
|
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1130
1175
|
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
@@ -1144,6 +1189,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1144
1189
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1145
1190
|
}
|
|
1146
1191
|
} break;
|
|
1192
|
+
case LLM_ARCH_PLM:
|
|
1193
|
+
{
|
|
1194
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1195
|
+
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
|
1196
|
+
switch (hparams.n_layer) {
|
|
1197
|
+
case 32: type = LLM_TYPE_1_8B; break;
|
|
1198
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1199
|
+
}
|
|
1200
|
+
} break;
|
|
1147
1201
|
case LLM_ARCH_CHATGLM:
|
|
1148
1202
|
{
|
|
1149
1203
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1165,6 +1219,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1165
1219
|
default: type = LLM_TYPE_UNKNOWN;
|
|
1166
1220
|
}
|
|
1167
1221
|
} break;
|
|
1222
|
+
case LLM_ARCH_GLM4:
|
|
1223
|
+
{
|
|
1224
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1225
|
+
switch (hparams.n_layer) {
|
|
1226
|
+
case 40: type = LLM_TYPE_9B; break;
|
|
1227
|
+
case 61: type = LLM_TYPE_32B; break;
|
|
1228
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1229
|
+
}
|
|
1230
|
+
} break;
|
|
1168
1231
|
case LLM_ARCH_BITNET:
|
|
1169
1232
|
{
|
|
1170
1233
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
@@ -1330,6 +1393,21 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
|
|
1330
1393
|
ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
|
|
1331
1394
|
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
|
1332
1395
|
} break;
|
|
1396
|
+
case LLM_ARCH_BAILINGMOE:
|
|
1397
|
+
{
|
|
1398
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
|
1399
|
+
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
|
|
1400
|
+
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
|
|
1401
|
+
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
|
|
1402
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
|
|
1403
|
+
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
|
|
1404
|
+
|
|
1405
|
+
switch (hparams.n_layer) {
|
|
1406
|
+
case 28: type = LLM_TYPE_16B; break;
|
|
1407
|
+
case 88: type = LLM_TYPE_290B; break;
|
|
1408
|
+
default: type = LLM_TYPE_UNKNOWN;
|
|
1409
|
+
}
|
|
1410
|
+
} break;
|
|
1333
1411
|
default: throw std::runtime_error("unsupported model architecture");
|
|
1334
1412
|
}
|
|
1335
1413
|
|
|
@@ -1557,9 +1635,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1557
1635
|
GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str());
|
|
1558
1636
|
}
|
|
1559
1637
|
|
|
1560
|
-
ggml_backend_buffer_type_t buft =
|
|
1638
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
|
1639
|
+
|
|
1640
|
+
// check overrides
|
|
1641
|
+
if (ml.tensor_buft_overrides) {
|
|
1642
|
+
std::string tensor_name = tn.str();
|
|
1643
|
+
for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
|
|
1644
|
+
std::regex pattern(overrides->pattern);
|
|
1645
|
+
if (std::regex_search(tensor_name, pattern)) {
|
|
1646
|
+
LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
|
|
1647
|
+
buft = overrides->buft;
|
|
1648
|
+
break;
|
|
1649
|
+
}
|
|
1650
|
+
}
|
|
1651
|
+
}
|
|
1652
|
+
|
|
1561
1653
|
if (!buft) {
|
|
1562
|
-
|
|
1654
|
+
buft = select_weight_buft(hparams, t_meta, op, *buft_list);
|
|
1655
|
+
if (!buft) {
|
|
1656
|
+
throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str()));
|
|
1657
|
+
}
|
|
1563
1658
|
}
|
|
1564
1659
|
|
|
1565
1660
|
// avoid using a host buffer when using mmap
|
|
@@ -1655,6 +1750,56 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1655
1750
|
}
|
|
1656
1751
|
}
|
|
1657
1752
|
} break;
|
|
1753
|
+
case LLM_ARCH_LLAMA4:
|
|
1754
|
+
{
|
|
1755
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1756
|
+
|
|
1757
|
+
// output
|
|
1758
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
1759
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
1760
|
+
|
|
1761
|
+
// if output is NULL, init from the input tok embed
|
|
1762
|
+
if (output == NULL) {
|
|
1763
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
1764
|
+
}
|
|
1765
|
+
|
|
1766
|
+
GGML_ASSERT(hparams.n_moe_layer_step > 0 && "Llama 4 requires n_moe_layer_step > 0");
|
|
1767
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
1768
|
+
bool is_moe_layer = (i + 1) % hparams.n_moe_layer_step == 0;
|
|
1769
|
+
|
|
1770
|
+
auto & layer = layers[i];
|
|
1771
|
+
|
|
1772
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
1773
|
+
|
|
1774
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
1775
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
1776
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
1777
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
1778
|
+
|
|
1779
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
1780
|
+
|
|
1781
|
+
layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
|
|
1782
|
+
|
|
1783
|
+
if (is_moe_layer) {
|
|
1784
|
+
int n_ff_exp = hparams.n_ff_exp;
|
|
1785
|
+
|
|
1786
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1787
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1788
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert}, 0);
|
|
1789
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
|
|
1790
|
+
|
|
1791
|
+
// Shared expert
|
|
1792
|
+
const int64_t n_ff_shexp = n_ff_exp;
|
|
1793
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1794
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd }, 0);
|
|
1795
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
1796
|
+
} else {
|
|
1797
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
1798
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
1799
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
1800
|
+
}
|
|
1801
|
+
}
|
|
1802
|
+
} break;
|
|
1658
1803
|
case LLM_ARCH_DECI:
|
|
1659
1804
|
{
|
|
1660
1805
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -1924,6 +2069,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1924
2069
|
} break;
|
|
1925
2070
|
case LLM_ARCH_BERT:
|
|
1926
2071
|
case LLM_ARCH_NOMIC_BERT:
|
|
2072
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
1927
2073
|
{
|
|
1928
2074
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
1929
2075
|
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
|
|
@@ -1957,20 +2103,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
1957
2103
|
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0);
|
|
1958
2104
|
}
|
|
1959
2105
|
|
|
2106
|
+
if (arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2107
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0);
|
|
2108
|
+
}
|
|
2109
|
+
|
|
1960
2110
|
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
1961
2111
|
|
|
1962
2112
|
layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
1963
2113
|
layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0);
|
|
1964
2114
|
|
|
1965
|
-
|
|
1966
|
-
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
1967
|
-
|
|
1968
|
-
if (arch == LLM_ARCH_BERT) {
|
|
2115
|
+
if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) {
|
|
1969
2116
|
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
1970
|
-
layer.
|
|
1971
|
-
layer.
|
|
2117
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0);
|
|
2118
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0);
|
|
2119
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
1972
2120
|
} else {
|
|
1973
|
-
layer.
|
|
2121
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2122
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
2123
|
+
|
|
2124
|
+
if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
2125
|
+
layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0);
|
|
2126
|
+
layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0);
|
|
2127
|
+
layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0);
|
|
2128
|
+
} else {
|
|
2129
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2130
|
+
}
|
|
1974
2131
|
}
|
|
1975
2132
|
|
|
1976
2133
|
layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0);
|
|
@@ -2254,6 +2411,77 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
2254
2411
|
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp}, 0);
|
|
2255
2412
|
}
|
|
2256
2413
|
} break;
|
|
2414
|
+
case LLM_ARCH_QWEN3:
|
|
2415
|
+
{
|
|
2416
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2417
|
+
|
|
2418
|
+
// output
|
|
2419
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2420
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
2421
|
+
// if output is NULL, init from the input tok embed
|
|
2422
|
+
if (output == NULL) {
|
|
2423
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
2424
|
+
}
|
|
2425
|
+
|
|
2426
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2427
|
+
auto & layer = layers[i];
|
|
2428
|
+
|
|
2429
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2430
|
+
|
|
2431
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2432
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2433
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2434
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2435
|
+
|
|
2436
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2437
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2438
|
+
|
|
2439
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2440
|
+
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
|
2441
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
2442
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
2443
|
+
}
|
|
2444
|
+
} break;
|
|
2445
|
+
case LLM_ARCH_QWEN3MOE:
|
|
2446
|
+
{
|
|
2447
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
2448
|
+
|
|
2449
|
+
// output
|
|
2450
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
2451
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
2452
|
+
|
|
2453
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
2454
|
+
auto & layer = layers[i];
|
|
2455
|
+
|
|
2456
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
2457
|
+
|
|
2458
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
2459
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2460
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
|
2461
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
|
|
2462
|
+
|
|
2463
|
+
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2464
|
+
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
|
|
2465
|
+
|
|
2466
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
2467
|
+
|
|
2468
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
2469
|
+
|
|
2470
|
+
if (n_expert == 0) {
|
|
2471
|
+
throw std::runtime_error("n_expert must be > 0 for QWEN3MOE");
|
|
2472
|
+
}
|
|
2473
|
+
if (n_expert_used == 0) {
|
|
2474
|
+
throw std::runtime_error("n_expert_used must be > 0 for QWEN3MOE");
|
|
2475
|
+
}
|
|
2476
|
+
|
|
2477
|
+
// MoE branch
|
|
2478
|
+
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
|
|
2479
|
+
|
|
2480
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2481
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
2482
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
2483
|
+
}
|
|
2484
|
+
} break;
|
|
2257
2485
|
case LLM_ARCH_PHI2:
|
|
2258
2486
|
{
|
|
2259
2487
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3003,8 +3231,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3003
3231
|
{
|
|
3004
3232
|
const bool is_lite = (hparams.n_layer == 27);
|
|
3005
3233
|
|
|
3234
|
+
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
|
3235
|
+
|
|
3236
|
+
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
|
3237
|
+
const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
|
3238
|
+
const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
|
3239
|
+
|
|
3006
3240
|
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
3007
|
-
const int64_t n_embd_head_qk_nope =
|
|
3241
|
+
const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
|
|
3008
3242
|
|
|
3009
3243
|
const int64_t q_lora_rank = hparams.n_lora_q;
|
|
3010
3244
|
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
@@ -3030,14 +3264,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3030
3264
|
|
|
3031
3265
|
if (!is_lite) {
|
|
3032
3266
|
layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0);
|
|
3033
|
-
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head *
|
|
3267
|
+
layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0);
|
|
3034
3268
|
} else {
|
|
3035
|
-
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd,
|
|
3269
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0);
|
|
3036
3270
|
}
|
|
3037
3271
|
|
|
3038
|
-
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank +
|
|
3039
|
-
|
|
3040
|
-
|
|
3272
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0);
|
|
3273
|
+
|
|
3274
|
+
// note: only old legacy GGUF files will have the unsplit wkv_b tensor in
|
|
3275
|
+
if (is_mla) {
|
|
3276
|
+
layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
|
|
3277
|
+
layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
|
|
3278
|
+
} else {
|
|
3279
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
|
|
3280
|
+
}
|
|
3281
|
+
|
|
3282
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0);
|
|
3041
3283
|
|
|
3042
3284
|
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3043
3285
|
|
|
@@ -3068,6 +3310,35 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3068
3310
|
}
|
|
3069
3311
|
}
|
|
3070
3312
|
} break;
|
|
3313
|
+
case LLM_ARCH_PLM:
|
|
3314
|
+
{
|
|
3315
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
3316
|
+
const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
3317
|
+
const int64_t kv_lora_rank = hparams.n_lora_kv;
|
|
3318
|
+
|
|
3319
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3320
|
+
|
|
3321
|
+
// output
|
|
3322
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3323
|
+
// output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
3324
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3325
|
+
|
|
3326
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3327
|
+
auto & layer = layers[i];
|
|
3328
|
+
|
|
3329
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3330
|
+
|
|
3331
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3332
|
+
layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0);
|
|
3333
|
+
layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {kv_lora_rank}, 0);
|
|
3334
|
+
layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0);
|
|
3335
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0);
|
|
3336
|
+
|
|
3337
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3338
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3339
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
|
3340
|
+
}
|
|
3341
|
+
} break;
|
|
3071
3342
|
case LLM_ARCH_BITNET:
|
|
3072
3343
|
{
|
|
3073
3344
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3254,6 +3525,45 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3254
3525
|
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
|
|
3255
3526
|
}
|
|
3256
3527
|
} break;
|
|
3528
|
+
case LLM_ARCH_GLM4:
|
|
3529
|
+
{
|
|
3530
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
3531
|
+
|
|
3532
|
+
// output
|
|
3533
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
3534
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
|
|
3535
|
+
// if output is NULL, init from the input tok embed
|
|
3536
|
+
if (output == NULL) {
|
|
3537
|
+
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
|
3538
|
+
}
|
|
3539
|
+
|
|
3540
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3541
|
+
auto & layer = layers[i];
|
|
3542
|
+
|
|
3543
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
3544
|
+
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3545
|
+
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3546
|
+
|
|
3547
|
+
if (layer.wqkv == nullptr) {
|
|
3548
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
|
|
3549
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
|
|
3550
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
|
|
3551
|
+
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
|
|
3552
|
+
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3553
|
+
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
|
|
3554
|
+
}
|
|
3555
|
+
|
|
3556
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
|
3557
|
+
|
|
3558
|
+
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3559
|
+
|
|
3560
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
3561
|
+
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
|
3562
|
+
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0);
|
|
3563
|
+
|
|
3564
|
+
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
|
3565
|
+
}
|
|
3566
|
+
} break;
|
|
3257
3567
|
case LLM_ARCH_NEMOTRON:
|
|
3258
3568
|
{
|
|
3259
3569
|
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
@@ -3712,8 +4022,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
|
|
3712
4022
|
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
|
|
3713
4023
|
output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
|
|
3714
4024
|
} break;
|
|
3715
|
-
|
|
3716
|
-
|
|
4025
|
+
case LLM_ARCH_BAILINGMOE:
|
|
4026
|
+
{
|
|
4027
|
+
const int64_t n_ff_exp = hparams.n_ff_exp;
|
|
4028
|
+
const int64_t n_expert_shared = hparams.n_expert_shared;
|
|
4029
|
+
|
|
4030
|
+
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
|
4031
|
+
|
|
4032
|
+
// output
|
|
4033
|
+
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
|
4034
|
+
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
|
4035
|
+
|
|
4036
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
4037
|
+
auto & layer = layers[i];
|
|
4038
|
+
|
|
4039
|
+
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
|
|
4040
|
+
|
|
4041
|
+
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_rot}, 0);
|
|
4042
|
+
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
|
|
4043
|
+
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_head_kv * n_rot}, 0);
|
|
4044
|
+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_rot, n_embd}, 0);
|
|
4045
|
+
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
|
|
4046
|
+
|
|
4047
|
+
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
|
|
4048
|
+
|
|
4049
|
+
if (n_expert == 0) {
|
|
4050
|
+
throw std::runtime_error("n_expert must be > 0");
|
|
4051
|
+
}
|
|
4052
|
+
if (n_expert_used == 0) {
|
|
4053
|
+
throw std::runtime_error("n_expert_used must be > 0");
|
|
4054
|
+
}
|
|
4055
|
+
|
|
4056
|
+
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4057
|
+
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
|
|
4058
|
+
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
|
|
4059
|
+
|
|
4060
|
+
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4061
|
+
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
|
|
4062
|
+
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
|
|
4063
|
+
}
|
|
4064
|
+
} break;
|
|
4065
|
+
default:
|
|
4066
|
+
throw std::runtime_error("unknown architecture");
|
|
3717
4067
|
}
|
|
3718
4068
|
|
|
3719
4069
|
if (n_moved_tensors > 0) {
|
|
@@ -3980,6 +4330,8 @@ void llama_model::print_info() const {
|
|
|
3980
4330
|
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
3981
4331
|
LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
|
|
3982
4332
|
LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv);
|
|
4333
|
+
LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla);
|
|
4334
|
+
LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla);
|
|
3983
4335
|
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
3984
4336
|
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
3985
4337
|
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
@@ -3993,12 +4345,24 @@ void llama_model::print_info() const {
|
|
|
3993
4345
|
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
|
|
3994
4346
|
}
|
|
3995
4347
|
|
|
4348
|
+
if (arch == LLM_ARCH_QWEN3MOE) {
|
|
4349
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
4350
|
+
}
|
|
4351
|
+
|
|
3996
4352
|
if (arch == LLM_ARCH_MINICPM || arch == LLM_ARCH_GRANITE || arch == LLM_ARCH_GRANITE_MOE) {
|
|
3997
4353
|
LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
|
|
3998
4354
|
LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
|
|
3999
4355
|
LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
|
|
4000
4356
|
}
|
|
4001
4357
|
|
|
4358
|
+
if (arch == LLM_ARCH_BAILINGMOE) {
|
|
4359
|
+
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
|
|
4360
|
+
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
|
|
4361
|
+
LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
|
|
4362
|
+
LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
|
|
4363
|
+
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
|
|
4364
|
+
}
|
|
4365
|
+
|
|
4002
4366
|
vocab.print_info();
|
|
4003
4367
|
}
|
|
4004
4368
|
|
|
@@ -4060,6 +4424,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const {
|
|
|
4060
4424
|
});
|
|
4061
4425
|
}
|
|
4062
4426
|
|
|
4427
|
+
bool llama_model::has_tensor_overrides() const {
|
|
4428
|
+
return pimpl->has_tensor_overrides;
|
|
4429
|
+
}
|
|
4430
|
+
|
|
4063
4431
|
const ggml_tensor * llama_model::get_tensor(const char * name) const {
|
|
4064
4432
|
auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(),
|
|
4065
4433
|
[name](const std::pair<std::string, ggml_tensor *> & it) {
|
|
@@ -4087,12 +4455,22 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4087
4455
|
// inp_pos - contains the positions
|
|
4088
4456
|
ggml_tensor * inp_pos = build_inp_pos();
|
|
4089
4457
|
|
|
4458
|
+
// temperature tuning
|
|
4459
|
+
ggml_tensor * inp_attn_scale = nullptr;
|
|
4460
|
+
if (arch == LLM_ARCH_LLAMA4) {
|
|
4461
|
+
inp_attn_scale = build_inp_attn_scale();
|
|
4462
|
+
}
|
|
4463
|
+
|
|
4090
4464
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
4091
4465
|
|
|
4092
4466
|
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
4093
4467
|
for (int il = 0; il < n_layer; ++il) {
|
|
4094
4468
|
ggml_tensor * inpSA = inpL;
|
|
4095
4469
|
|
|
4470
|
+
bool use_rope = arch == LLM_ARCH_LLAMA4
|
|
4471
|
+
? (il + 1) % hparams.n_no_rope_layer_step != 0
|
|
4472
|
+
: true;
|
|
4473
|
+
|
|
4096
4474
|
// norm
|
|
4097
4475
|
cur = build_norm(inpL,
|
|
4098
4476
|
model.layers[il].attn_norm, NULL,
|
|
@@ -4130,25 +4508,38 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4130
4508
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
4131
4509
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
4132
4510
|
|
|
4133
|
-
|
|
4134
|
-
|
|
4135
|
-
|
|
4136
|
-
|
|
4137
|
-
|
|
4511
|
+
if (use_rope) {
|
|
4512
|
+
Qcur = ggml_rope_ext(
|
|
4513
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
4514
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
4515
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4516
|
+
);
|
|
4138
4517
|
|
|
4139
|
-
|
|
4140
|
-
|
|
4141
|
-
|
|
4142
|
-
|
|
4143
|
-
|
|
4518
|
+
Kcur = ggml_rope_ext(
|
|
4519
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
4520
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
4521
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4522
|
+
);
|
|
4523
|
+
} else if (inp_attn_scale) {
|
|
4524
|
+
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
|
|
4525
|
+
}
|
|
4144
4526
|
|
|
4145
4527
|
cb(Qcur, "Qcur", il);
|
|
4146
4528
|
cb(Kcur, "Kcur", il);
|
|
4147
4529
|
cb(Vcur, "Vcur", il);
|
|
4148
4530
|
|
|
4531
|
+
if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
|
|
4532
|
+
// Llama4TextL2Norm
|
|
4533
|
+
Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
|
|
4534
|
+
Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
|
|
4535
|
+
cb(Qcur, "Qcur_normed", il);
|
|
4536
|
+
cb(Kcur, "Kcur_normed", il);
|
|
4537
|
+
}
|
|
4538
|
+
|
|
4149
4539
|
cur = build_attn(inp_attn, gf,
|
|
4150
4540
|
model.layers[il].wo, model.layers[il].bo,
|
|
4151
|
-
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
|
4541
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
4542
|
+
cb(cur, "attn_out", il);
|
|
4152
4543
|
}
|
|
4153
4544
|
|
|
4154
4545
|
if (il == n_layer - 1) {
|
|
@@ -4166,7 +4557,7 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4166
4557
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
4167
4558
|
cb(ffn_inp, "ffn_inp", il);
|
|
4168
4559
|
|
|
4169
|
-
// feed-forward network
|
|
4560
|
+
// feed-forward network (non-MoE)
|
|
4170
4561
|
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
4171
4562
|
|
|
4172
4563
|
cur = build_norm(ffn_inp,
|
|
@@ -4181,6 +4572,38 @@ struct llm_build_llama : public llm_graph_context {
|
|
|
4181
4572
|
NULL,
|
|
4182
4573
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
4183
4574
|
cb(cur, "ffn_out", il);
|
|
4575
|
+
|
|
4576
|
+
} else if (arch == LLM_ARCH_LLAMA4) {
|
|
4577
|
+
// llama4 MoE
|
|
4578
|
+
ggml_tensor * ffn_inp_normed = build_norm(ffn_inp,
|
|
4579
|
+
model.layers[il].ffn_norm, NULL,
|
|
4580
|
+
LLM_NORM_RMS, il);
|
|
4581
|
+
cb(cur, "ffn_norm", il);
|
|
4582
|
+
|
|
4583
|
+
ggml_tensor * moe_out = build_moe_ffn(ffn_inp_normed,
|
|
4584
|
+
model.layers[il].ffn_gate_inp,
|
|
4585
|
+
model.layers[il].ffn_up_exps,
|
|
4586
|
+
model.layers[il].ffn_gate_exps,
|
|
4587
|
+
model.layers[il].ffn_down_exps,
|
|
4588
|
+
nullptr,
|
|
4589
|
+
n_expert, n_expert_used,
|
|
4590
|
+
LLM_FFN_SILU, false,
|
|
4591
|
+
false, 0.0,
|
|
4592
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID,
|
|
4593
|
+
il);
|
|
4594
|
+
|
|
4595
|
+
// Shared experts
|
|
4596
|
+
ggml_tensor * shexp_out = build_ffn(ffn_inp_normed,
|
|
4597
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
4598
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
4599
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
4600
|
+
NULL,
|
|
4601
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
4602
|
+
cb(shexp_out, "ffn_moe_shexp", il);
|
|
4603
|
+
|
|
4604
|
+
cur = ggml_add(ctx0, moe_out, shexp_out);
|
|
4605
|
+
cb(cur, "ffn_moe_out_merged", il);
|
|
4606
|
+
|
|
4184
4607
|
} else {
|
|
4185
4608
|
// MoE branch
|
|
4186
4609
|
cur = build_norm(ffn_inp,
|
|
@@ -4328,7 +4751,7 @@ struct llm_build_deci : public llm_graph_context {
|
|
|
4328
4751
|
|
|
4329
4752
|
cur = build_attn(inp_attn, gf,
|
|
4330
4753
|
model.layers[il].wo, model.layers[il].bo,
|
|
4331
|
-
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
|
4754
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
4332
4755
|
}
|
|
4333
4756
|
|
|
4334
4757
|
if (il == n_layer - 1) {
|
|
@@ -4470,7 +4893,7 @@ struct llm_build_baichuan : public llm_graph_context {
|
|
|
4470
4893
|
|
|
4471
4894
|
cur = build_attn(inp_attn, gf,
|
|
4472
4895
|
model.layers[il].wo, NULL,
|
|
4473
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
4896
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
4474
4897
|
}
|
|
4475
4898
|
|
|
4476
4899
|
if (il == n_layer - 1) {
|
|
@@ -4585,7 +5008,7 @@ struct llm_build_xverse : public llm_graph_context {
|
|
|
4585
5008
|
|
|
4586
5009
|
cur = build_attn(inp_attn, gf,
|
|
4587
5010
|
model.layers[il].wo, NULL,
|
|
4588
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5011
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
4589
5012
|
}
|
|
4590
5013
|
|
|
4591
5014
|
if (il == n_layer - 1) {
|
|
@@ -4710,7 +5133,7 @@ struct llm_build_falcon : public llm_graph_context {
|
|
|
4710
5133
|
|
|
4711
5134
|
cur = build_attn(inp_attn, gf,
|
|
4712
5135
|
model.layers[il].wo, NULL,
|
|
4713
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5136
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
4714
5137
|
}
|
|
4715
5138
|
|
|
4716
5139
|
if (il == n_layer - 1) {
|
|
@@ -4840,7 +5263,7 @@ struct llm_build_grok : public llm_graph_context {
|
|
|
4840
5263
|
|
|
4841
5264
|
cur = build_attn(inp_attn, gf,
|
|
4842
5265
|
model.layers[il].wo, model.layers[il].bo,
|
|
4843
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
5266
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
4844
5267
|
}
|
|
4845
5268
|
|
|
4846
5269
|
if (il == n_layer - 1) {
|
|
@@ -4991,7 +5414,7 @@ struct llm_build_dbrx : public llm_graph_context {
|
|
|
4991
5414
|
|
|
4992
5415
|
cur = build_attn(inp_attn, gf,
|
|
4993
5416
|
model.layers[il].wo, NULL,
|
|
4994
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5417
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
4995
5418
|
}
|
|
4996
5419
|
|
|
4997
5420
|
if (il == n_layer - 1) {
|
|
@@ -5105,7 +5528,7 @@ struct llm_build_starcoder : public llm_graph_context {
|
|
|
5105
5528
|
|
|
5106
5529
|
cur = build_attn(inp_attn, gf,
|
|
5107
5530
|
model.layers[il].wo, model.layers[il].bo,
|
|
5108
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5531
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5109
5532
|
}
|
|
5110
5533
|
|
|
5111
5534
|
if (il == n_layer - 1) {
|
|
@@ -5204,7 +5627,7 @@ struct llm_build_refact : public llm_graph_context {
|
|
|
5204
5627
|
|
|
5205
5628
|
cur = build_attn(inp_attn, gf,
|
|
5206
5629
|
model.layers[il].wo, NULL,
|
|
5207
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5630
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5208
5631
|
}
|
|
5209
5632
|
|
|
5210
5633
|
if (il == n_layer - 1) {
|
|
@@ -5331,6 +5754,11 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5331
5754
|
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
5332
5755
|
cb(cur, "wqkv", il);
|
|
5333
5756
|
|
|
5757
|
+
if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
5758
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
5759
|
+
cb(cur, "bqkv", il);
|
|
5760
|
+
}
|
|
5761
|
+
|
|
5334
5762
|
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
5335
5763
|
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
5336
5764
|
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
@@ -5358,7 +5786,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5358
5786
|
|
|
5359
5787
|
cur = build_attn(inp_attn, gf,
|
|
5360
5788
|
model.layers[il].wo, model.layers[il].bo,
|
|
5361
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5789
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5362
5790
|
cb(cur, "kqv_out", il);
|
|
5363
5791
|
|
|
5364
5792
|
if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
|
@@ -5383,13 +5811,29 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5383
5811
|
cb(ffn_inp, "ffn_inp", il);
|
|
5384
5812
|
|
|
5385
5813
|
// feed-forward network
|
|
5386
|
-
if (
|
|
5814
|
+
if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) {
|
|
5815
|
+
// MoE branch
|
|
5816
|
+
cur = build_moe_ffn(cur,
|
|
5817
|
+
model.layers[il].ffn_gate_inp,
|
|
5818
|
+
model.layers[il].ffn_up_exps,
|
|
5819
|
+
nullptr,
|
|
5820
|
+
model.layers[il].ffn_down_exps,
|
|
5821
|
+
nullptr,
|
|
5822
|
+
hparams.n_expert,
|
|
5823
|
+
hparams.n_expert_used,
|
|
5824
|
+
LLM_FFN_GELU,
|
|
5825
|
+
false, false,
|
|
5826
|
+
0.0f,
|
|
5827
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il);
|
|
5828
|
+
cb(cur, "ffn_moe_out", il);
|
|
5829
|
+
} else if (model.arch == LLM_ARCH_BERT || model.arch == LLM_ARCH_NOMIC_BERT_MOE) {
|
|
5387
5830
|
cur = build_ffn(cur,
|
|
5388
5831
|
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
5389
5832
|
NULL, NULL, NULL,
|
|
5390
5833
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
5391
5834
|
NULL,
|
|
5392
5835
|
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
5836
|
+
cb(cur, "ffn_out", il);
|
|
5393
5837
|
} else if (model.arch == LLM_ARCH_JINA_BERT_V2) {
|
|
5394
5838
|
cur = build_ffn(cur,
|
|
5395
5839
|
model.layers[il].ffn_up, NULL, NULL,
|
|
@@ -5397,6 +5841,7 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5397
5841
|
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
5398
5842
|
NULL,
|
|
5399
5843
|
LLM_FFN_GELU, LLM_FFN_PAR, il);
|
|
5844
|
+
cb(cur, "ffn_out", il);
|
|
5400
5845
|
} else {
|
|
5401
5846
|
cur = build_ffn(cur,
|
|
5402
5847
|
model.layers[il].ffn_up, NULL, NULL,
|
|
@@ -5404,8 +5849,8 @@ struct llm_build_bert : public llm_graph_context {
|
|
|
5404
5849
|
model.layers[il].ffn_down, NULL, NULL,
|
|
5405
5850
|
NULL,
|
|
5406
5851
|
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
5852
|
+
cb(cur, "ffn_out", il);
|
|
5407
5853
|
}
|
|
5408
|
-
cb(cur, "ffn_out", il);
|
|
5409
5854
|
|
|
5410
5855
|
// attentions bypass the intermediate layer
|
|
5411
5856
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
@@ -5475,7 +5920,7 @@ struct llm_build_bloom : public llm_graph_context {
|
|
|
5475
5920
|
|
|
5476
5921
|
cur = build_attn(inp_attn, gf,
|
|
5477
5922
|
model.layers[il].wo, model.layers[il].bo,
|
|
5478
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5923
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5479
5924
|
}
|
|
5480
5925
|
|
|
5481
5926
|
if (il == n_layer - 1) {
|
|
@@ -5616,7 +6061,7 @@ struct llm_build_mpt : public llm_graph_context {
|
|
|
5616
6061
|
|
|
5617
6062
|
cur = build_attn(inp_attn, gf,
|
|
5618
6063
|
model.layers[il].wo, model.layers[il].bo,
|
|
5619
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6064
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5620
6065
|
}
|
|
5621
6066
|
|
|
5622
6067
|
if (il == n_layer - 1) {
|
|
@@ -5762,7 +6207,7 @@ struct llm_build_stablelm : public llm_graph_context {
|
|
|
5762
6207
|
|
|
5763
6208
|
cur = build_attn(inp_attn, gf,
|
|
5764
6209
|
model.layers[il].wo, NULL,
|
|
5765
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6210
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5766
6211
|
}
|
|
5767
6212
|
|
|
5768
6213
|
if (il == n_layer - 1) {
|
|
@@ -5885,7 +6330,7 @@ struct llm_build_qwen : public llm_graph_context {
|
|
|
5885
6330
|
|
|
5886
6331
|
cur = build_attn(inp_attn, gf,
|
|
5887
6332
|
model.layers[il].wo, NULL,
|
|
5888
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6333
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
5889
6334
|
}
|
|
5890
6335
|
|
|
5891
6336
|
if (il == n_layer - 1) {
|
|
@@ -6005,7 +6450,7 @@ struct llm_build_qwen2 : public llm_graph_context {
|
|
|
6005
6450
|
|
|
6006
6451
|
cur = build_attn(inp_attn, gf,
|
|
6007
6452
|
model.layers[il].wo, model.layers[il].bo,
|
|
6008
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6453
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6009
6454
|
}
|
|
6010
6455
|
|
|
6011
6456
|
if (il == n_layer - 1) {
|
|
@@ -6126,7 +6571,7 @@ struct llm_build_qwen2vl : public llm_graph_context {
|
|
|
6126
6571
|
|
|
6127
6572
|
cur = build_attn(inp_attn, gf,
|
|
6128
6573
|
model.layers[il].wo, model.layers[il].bo,
|
|
6129
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6574
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6130
6575
|
}
|
|
6131
6576
|
|
|
6132
6577
|
if (il == n_layer - 1) {
|
|
@@ -6253,7 +6698,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6253
6698
|
|
|
6254
6699
|
cur = build_attn(inp_attn, gf,
|
|
6255
6700
|
model.layers[il].wo, model.layers[il].bo,
|
|
6256
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6701
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6257
6702
|
}
|
|
6258
6703
|
|
|
6259
6704
|
if (il == n_layer - 1) {
|
|
@@ -6284,7 +6729,7 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6284
6729
|
false, 0.0,
|
|
6285
6730
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
6286
6731
|
il);
|
|
6287
|
-
cb(
|
|
6732
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
6288
6733
|
|
|
6289
6734
|
// FFN shared expert
|
|
6290
6735
|
{
|
|
@@ -6340,16 +6785,14 @@ struct llm_build_qwen2moe : public llm_graph_context {
|
|
|
6340
6785
|
}
|
|
6341
6786
|
};
|
|
6342
6787
|
|
|
6343
|
-
struct
|
|
6344
|
-
|
|
6788
|
+
struct llm_build_qwen3 : public llm_graph_context {
|
|
6789
|
+
llm_build_qwen3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6345
6790
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6346
|
-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6347
6791
|
|
|
6348
6792
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6793
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
6349
6794
|
|
|
6350
6795
|
ggml_tensor * cur;
|
|
6351
|
-
ggml_tensor * attn_norm_output;
|
|
6352
|
-
ggml_tensor * ffn_output;
|
|
6353
6796
|
ggml_tensor * inpL;
|
|
6354
6797
|
|
|
6355
6798
|
inpL = build_inp_embd(model.tok_embd);
|
|
@@ -6360,48 +6803,42 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
6360
6803
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6361
6804
|
|
|
6362
6805
|
for (int il = 0; il < n_layer; ++il) {
|
|
6363
|
-
|
|
6364
|
-
|
|
6365
|
-
|
|
6366
|
-
|
|
6367
|
-
|
|
6806
|
+
ggml_tensor * inpSA = inpL;
|
|
6807
|
+
|
|
6808
|
+
// norm
|
|
6809
|
+
cur = build_norm(inpL,
|
|
6810
|
+
model.layers[il].attn_norm, NULL,
|
|
6811
|
+
LLM_NORM_RMS, il);
|
|
6812
|
+
cb(cur, "attn_norm", il);
|
|
6368
6813
|
|
|
6369
6814
|
// self-attention
|
|
6370
6815
|
{
|
|
6371
|
-
|
|
6372
|
-
ggml_tensor *
|
|
6373
|
-
ggml_tensor * Vcur = nullptr;
|
|
6374
|
-
|
|
6375
|
-
if (model.layers[il].wqkv) {
|
|
6376
|
-
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
6377
|
-
cb(cur, "wqkv", il);
|
|
6378
|
-
|
|
6379
|
-
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
6380
|
-
cb(cur, "bqkv", il);
|
|
6381
|
-
|
|
6382
|
-
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
6383
|
-
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
6384
|
-
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
6385
|
-
} else {
|
|
6386
|
-
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
6387
|
-
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
6388
|
-
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
6389
|
-
}
|
|
6390
|
-
|
|
6816
|
+
// compute Q and K and RoPE them
|
|
6817
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
6391
6818
|
cb(Qcur, "Qcur", il);
|
|
6819
|
+
|
|
6820
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
6392
6821
|
cb(Kcur, "Kcur", il);
|
|
6822
|
+
|
|
6823
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
6393
6824
|
cb(Vcur, "Vcur", il);
|
|
6394
6825
|
|
|
6395
6826
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6396
6827
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6397
6828
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6398
6829
|
|
|
6830
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
6831
|
+
cb(Qcur, "Qcur_normed", il);
|
|
6832
|
+
|
|
6399
6833
|
Qcur = ggml_rope_ext(
|
|
6400
6834
|
ctx0, Qcur, inp_pos, nullptr,
|
|
6401
6835
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6402
6836
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6403
6837
|
);
|
|
6404
6838
|
|
|
6839
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
6840
|
+
cb(Kcur, "Kcur_normed", il);
|
|
6841
|
+
|
|
6405
6842
|
Kcur = ggml_rope_ext(
|
|
6406
6843
|
ctx0, Kcur, inp_pos, nullptr,
|
|
6407
6844
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
@@ -6412,36 +6849,36 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
6412
6849
|
cb(Kcur, "Kcur", il);
|
|
6413
6850
|
cb(Vcur, "Vcur", il);
|
|
6414
6851
|
|
|
6415
|
-
// with phi2, we scale the Q to avoid precision issues
|
|
6416
|
-
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
|
6417
|
-
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
|
6418
|
-
|
|
6419
6852
|
cur = build_attn(inp_attn, gf,
|
|
6420
6853
|
model.layers[il].wo, model.layers[il].bo,
|
|
6421
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
6854
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6422
6855
|
}
|
|
6423
6856
|
|
|
6424
6857
|
if (il == n_layer - 1) {
|
|
6425
6858
|
// skip computing output for unused tokens
|
|
6426
6859
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6427
|
-
cur
|
|
6428
|
-
|
|
6429
|
-
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
|
6860
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6861
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6430
6862
|
}
|
|
6431
6863
|
|
|
6432
|
-
|
|
6433
|
-
|
|
6434
|
-
ffn_output = build_ffn(attn_norm_output,
|
|
6435
|
-
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
6436
|
-
NULL, NULL, NULL,
|
|
6437
|
-
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
6438
|
-
NULL,
|
|
6439
|
-
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
6440
|
-
cb(ffn_output, "ffn_out", il);
|
|
6441
|
-
}
|
|
6864
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
6865
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6442
6866
|
|
|
6443
|
-
|
|
6444
|
-
cur =
|
|
6867
|
+
// feed-forward network
|
|
6868
|
+
cur = build_norm(ffn_inp,
|
|
6869
|
+
model.layers[il].ffn_norm, NULL,
|
|
6870
|
+
LLM_NORM_RMS, il);
|
|
6871
|
+
cb(cur, "ffn_norm", il);
|
|
6872
|
+
|
|
6873
|
+
cur = build_ffn(cur,
|
|
6874
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
6875
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
6876
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
6877
|
+
NULL,
|
|
6878
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
6879
|
+
cb(cur, "ffn_out", il);
|
|
6880
|
+
|
|
6881
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6445
6882
|
|
|
6446
6883
|
cur = build_cvec(cur, il);
|
|
6447
6884
|
cb(cur, "l_out", il);
|
|
@@ -6450,18 +6887,17 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
6450
6887
|
inpL = cur;
|
|
6451
6888
|
}
|
|
6452
6889
|
|
|
6453
|
-
cur =
|
|
6454
|
-
|
|
6455
|
-
|
|
6456
|
-
|
|
6890
|
+
cur = inpL;
|
|
6891
|
+
|
|
6892
|
+
cur = build_norm(cur,
|
|
6893
|
+
model.output_norm, NULL,
|
|
6894
|
+
LLM_NORM_RMS, -1);
|
|
6457
6895
|
|
|
6458
6896
|
cb(cur, "result_norm", -1);
|
|
6459
6897
|
res->t_embd = cur;
|
|
6460
6898
|
|
|
6899
|
+
// lm_head
|
|
6461
6900
|
cur = build_lora_mm(model.output, cur);
|
|
6462
|
-
cb(cur, "result_output_no_bias", -1);
|
|
6463
|
-
|
|
6464
|
-
cur = ggml_add(ctx0, cur, model.output_b);
|
|
6465
6901
|
|
|
6466
6902
|
cb(cur, "result_output", -1);
|
|
6467
6903
|
res->t_logits = cur;
|
|
@@ -6470,12 +6906,12 @@ struct llm_build_phi2 : public llm_graph_context {
|
|
|
6470
6906
|
}
|
|
6471
6907
|
};
|
|
6472
6908
|
|
|
6473
|
-
struct
|
|
6474
|
-
|
|
6909
|
+
struct llm_build_qwen3moe : public llm_graph_context {
|
|
6910
|
+
llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6475
6911
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6476
|
-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6477
6912
|
|
|
6478
6913
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6914
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
6479
6915
|
|
|
6480
6916
|
ggml_tensor * cur;
|
|
6481
6917
|
ggml_tensor * inpL;
|
|
@@ -6488,52 +6924,44 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6488
6924
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6489
6925
|
|
|
6490
6926
|
for (int il = 0; il < n_layer; ++il) {
|
|
6491
|
-
|
|
6492
|
-
|
|
6493
|
-
// self-attention
|
|
6494
|
-
{
|
|
6495
|
-
// rope freq factors for 128k context
|
|
6496
|
-
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
|
6497
|
-
|
|
6498
|
-
ggml_tensor* attn_norm_output = build_norm(inpL,
|
|
6499
|
-
model.layers[il].attn_norm,
|
|
6500
|
-
model.layers[il].attn_norm_b,
|
|
6501
|
-
LLM_NORM_RMS, il);
|
|
6502
|
-
cb(attn_norm_output, "attn_norm", il);
|
|
6503
|
-
|
|
6504
|
-
ggml_tensor * Qcur = nullptr;
|
|
6505
|
-
ggml_tensor * Kcur = nullptr;
|
|
6506
|
-
ggml_tensor * Vcur = nullptr;
|
|
6507
|
-
|
|
6508
|
-
if (model.layers[il].wqkv) {
|
|
6509
|
-
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
6510
|
-
cb(cur, "wqkv", il);
|
|
6927
|
+
ggml_tensor * inpSA = inpL;
|
|
6511
6928
|
|
|
6512
|
-
|
|
6513
|
-
|
|
6514
|
-
|
|
6515
|
-
|
|
6516
|
-
|
|
6517
|
-
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
6518
|
-
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
6519
|
-
}
|
|
6929
|
+
// norm
|
|
6930
|
+
cur = build_norm(inpL,
|
|
6931
|
+
model.layers[il].attn_norm, NULL,
|
|
6932
|
+
LLM_NORM_RMS, il);
|
|
6933
|
+
cb(cur, "attn_norm", il);
|
|
6520
6934
|
|
|
6935
|
+
// self_attention
|
|
6936
|
+
{
|
|
6937
|
+
// compute Q and K and RoPE them
|
|
6938
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
6521
6939
|
cb(Qcur, "Qcur", il);
|
|
6940
|
+
|
|
6941
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
6522
6942
|
cb(Kcur, "Kcur", il);
|
|
6943
|
+
|
|
6944
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
6523
6945
|
cb(Vcur, "Vcur", il);
|
|
6524
6946
|
|
|
6525
6947
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
6526
6948
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6527
6949
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6528
6950
|
|
|
6529
|
-
Qcur =
|
|
6530
|
-
|
|
6951
|
+
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
|
|
6952
|
+
cb(Qcur, "Qcur_normed", il);
|
|
6953
|
+
|
|
6954
|
+
Qcur = ggml_rope_ext(
|
|
6955
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
6531
6956
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6532
6957
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6533
6958
|
);
|
|
6534
6959
|
|
|
6960
|
+
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
|
|
6961
|
+
cb(Kcur, "Kcur_normed", il);
|
|
6962
|
+
|
|
6535
6963
|
Kcur = ggml_rope_ext(
|
|
6536
|
-
ctx0, Kcur, inp_pos,
|
|
6964
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
6537
6965
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6538
6966
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6539
6967
|
);
|
|
@@ -6542,41 +6970,29 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6542
6970
|
cb(Kcur, "Kcur", il);
|
|
6543
6971
|
cb(Vcur, "Vcur", il);
|
|
6544
6972
|
|
|
6545
|
-
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
|
6546
|
-
cb(Qcur, "Qcur", il);
|
|
6547
|
-
|
|
6548
6973
|
cur = build_attn(inp_attn, gf,
|
|
6549
6974
|
model.layers[il].wo, model.layers[il].bo,
|
|
6550
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
6975
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6551
6976
|
}
|
|
6552
6977
|
|
|
6553
6978
|
if (il == n_layer - 1) {
|
|
6554
6979
|
// skip computing output for unused tokens
|
|
6555
|
-
ggml_tensor* inp_out_ids = build_inp_out_ids();
|
|
6556
|
-
cur
|
|
6557
|
-
|
|
6980
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6981
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
6982
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
6558
6983
|
}
|
|
6559
6984
|
|
|
6560
|
-
|
|
6561
|
-
|
|
6985
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
6986
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
6562
6987
|
|
|
6563
|
-
|
|
6564
|
-
|
|
6988
|
+
// MoE branch
|
|
6989
|
+
cur = build_norm(ffn_inp,
|
|
6990
|
+
model.layers[il].ffn_norm, NULL,
|
|
6565
6991
|
LLM_NORM_RMS, il);
|
|
6566
6992
|
cb(cur, "ffn_norm", il);
|
|
6567
6993
|
|
|
6568
|
-
|
|
6569
|
-
|
|
6570
|
-
cur = build_ffn(cur,
|
|
6571
|
-
model.layers[il].ffn_up, NULL, NULL,
|
|
6572
|
-
NULL, NULL, NULL,
|
|
6573
|
-
model.layers[il].ffn_down, NULL, NULL,
|
|
6574
|
-
NULL,
|
|
6575
|
-
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
6576
|
-
cb(cur, "ffn_out", il);
|
|
6577
|
-
} else {
|
|
6578
|
-
// MoE branch
|
|
6579
|
-
cur = build_moe_ffn(cur,
|
|
6994
|
+
ggml_tensor * moe_out =
|
|
6995
|
+
build_moe_ffn(cur,
|
|
6580
6996
|
model.layers[il].ffn_gate_inp,
|
|
6581
6997
|
model.layers[il].ffn_up_exps,
|
|
6582
6998
|
model.layers[il].ffn_gate_exps,
|
|
@@ -6587,10 +7003,10 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6587
7003
|
false, 0.0,
|
|
6588
7004
|
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
6589
7005
|
il);
|
|
6590
|
-
|
|
6591
|
-
|
|
7006
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
7007
|
+
cur = moe_out;
|
|
6592
7008
|
|
|
6593
|
-
cur = ggml_add(ctx0,
|
|
7009
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6594
7010
|
|
|
6595
7011
|
cur = build_cvec(cur, il);
|
|
6596
7012
|
cb(cur, "l_out", il);
|
|
@@ -6599,21 +7015,18 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6599
7015
|
inpL = cur;
|
|
6600
7016
|
}
|
|
6601
7017
|
|
|
6602
|
-
cur =
|
|
6603
|
-
|
|
6604
|
-
|
|
7018
|
+
cur = inpL;
|
|
7019
|
+
|
|
7020
|
+
cur = build_norm(cur,
|
|
7021
|
+
model.output_norm, NULL,
|
|
6605
7022
|
LLM_NORM_RMS, -1);
|
|
6606
7023
|
|
|
6607
7024
|
cb(cur, "result_norm", -1);
|
|
6608
7025
|
res->t_embd = cur;
|
|
6609
7026
|
|
|
7027
|
+
// lm_head
|
|
6610
7028
|
cur = build_lora_mm(model.output, cur);
|
|
6611
7029
|
|
|
6612
|
-
if (model.output_b != nullptr) {
|
|
6613
|
-
cb(cur, "result_output_no_bias", -1);
|
|
6614
|
-
cur = ggml_add(ctx0, cur, model.output_b);
|
|
6615
|
-
}
|
|
6616
|
-
|
|
6617
7030
|
cb(cur, "result_output", -1);
|
|
6618
7031
|
res->t_logits = cur;
|
|
6619
7032
|
|
|
@@ -6621,14 +7034,16 @@ struct llm_build_phi3 : public llm_graph_context {
|
|
|
6621
7034
|
}
|
|
6622
7035
|
};
|
|
6623
7036
|
|
|
6624
|
-
struct
|
|
6625
|
-
|
|
7037
|
+
struct llm_build_phi2 : public llm_graph_context {
|
|
7038
|
+
llm_build_phi2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6626
7039
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7040
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6627
7041
|
|
|
6628
7042
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6629
|
-
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
6630
7043
|
|
|
6631
7044
|
ggml_tensor * cur;
|
|
7045
|
+
ggml_tensor * attn_norm_output;
|
|
7046
|
+
ggml_tensor * ffn_output;
|
|
6632
7047
|
ggml_tensor * inpL;
|
|
6633
7048
|
|
|
6634
7049
|
inpL = build_inp_embd(model.tok_embd);
|
|
@@ -6639,25 +7054,36 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6639
7054
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6640
7055
|
|
|
6641
7056
|
for (int il = 0; il < n_layer; ++il) {
|
|
6642
|
-
|
|
6643
|
-
|
|
6644
|
-
|
|
6645
|
-
|
|
6646
|
-
|
|
6647
|
-
cb(cur, "attn_norm", il);
|
|
6648
|
-
|
|
6649
|
-
ggml_tensor * attention_norm = cur;
|
|
7057
|
+
attn_norm_output = build_norm(inpL,
|
|
7058
|
+
model.layers[il].attn_norm,
|
|
7059
|
+
model.layers[il].attn_norm_b,
|
|
7060
|
+
LLM_NORM, il);
|
|
7061
|
+
cb(attn_norm_output, "attn_norm", il);
|
|
6650
7062
|
|
|
6651
7063
|
// self-attention
|
|
6652
7064
|
{
|
|
6653
|
-
|
|
6654
|
-
ggml_tensor *
|
|
6655
|
-
|
|
7065
|
+
ggml_tensor * Qcur = nullptr;
|
|
7066
|
+
ggml_tensor * Kcur = nullptr;
|
|
7067
|
+
ggml_tensor * Vcur = nullptr;
|
|
6656
7068
|
|
|
6657
|
-
|
|
6658
|
-
|
|
7069
|
+
if (model.layers[il].wqkv) {
|
|
7070
|
+
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
7071
|
+
cb(cur, "wqkv", il);
|
|
6659
7072
|
|
|
6660
|
-
|
|
7073
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7074
|
+
cb(cur, "bqkv", il);
|
|
7075
|
+
|
|
7076
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
7077
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
7078
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
7079
|
+
} else {
|
|
7080
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
7081
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
7082
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
7083
|
+
}
|
|
7084
|
+
|
|
7085
|
+
cb(Qcur, "Qcur", il);
|
|
7086
|
+
cb(Kcur, "Kcur", il);
|
|
6661
7087
|
cb(Vcur, "Vcur", il);
|
|
6662
7088
|
|
|
6663
7089
|
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
@@ -6666,13 +7092,13 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6666
7092
|
|
|
6667
7093
|
Qcur = ggml_rope_ext(
|
|
6668
7094
|
ctx0, Qcur, inp_pos, nullptr,
|
|
6669
|
-
|
|
7095
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6670
7096
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6671
7097
|
);
|
|
6672
7098
|
|
|
6673
7099
|
Kcur = ggml_rope_ext(
|
|
6674
7100
|
ctx0, Kcur, inp_pos, nullptr,
|
|
6675
|
-
|
|
7101
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
6676
7102
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
6677
7103
|
);
|
|
6678
7104
|
|
|
@@ -6680,34 +7106,35 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6680
7106
|
cb(Kcur, "Kcur", il);
|
|
6681
7107
|
cb(Vcur, "Vcur", il);
|
|
6682
7108
|
|
|
7109
|
+
// with phi2, we scale the Q to avoid precision issues
|
|
7110
|
+
// ref: https://github.com/ml-explore/mlx-examples/blob/08e862336ade809bc37d1035f94b359e7d1a5152/phi2/phi2.py#L64-L66
|
|
7111
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f/sqrtf(float(n_embd_head)));
|
|
7112
|
+
|
|
6683
7113
|
cur = build_attn(inp_attn, gf,
|
|
6684
|
-
model.layers[il].wo,
|
|
6685
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f
|
|
7114
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
7115
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
6686
7116
|
}
|
|
6687
|
-
ggml_tensor * sa_out = cur;
|
|
6688
|
-
|
|
6689
|
-
cur = attention_norm;
|
|
6690
7117
|
|
|
6691
7118
|
if (il == n_layer - 1) {
|
|
6692
7119
|
// skip computing output for unused tokens
|
|
6693
7120
|
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
6694
|
-
cur
|
|
6695
|
-
|
|
6696
|
-
|
|
7121
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7122
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7123
|
+
attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids);
|
|
6697
7124
|
}
|
|
6698
7125
|
|
|
6699
|
-
//
|
|
7126
|
+
// FF
|
|
6700
7127
|
{
|
|
6701
|
-
|
|
6702
|
-
model.layers[il].ffn_up,
|
|
6703
|
-
|
|
6704
|
-
model.layers[il].ffn_down,
|
|
7128
|
+
ffn_output = build_ffn(attn_norm_output,
|
|
7129
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
7130
|
+
NULL, NULL, NULL,
|
|
7131
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
6705
7132
|
NULL,
|
|
6706
|
-
|
|
6707
|
-
cb(
|
|
7133
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
7134
|
+
cb(ffn_output, "ffn_out", il);
|
|
6708
7135
|
}
|
|
6709
7136
|
|
|
6710
|
-
cur = ggml_add(ctx0, cur,
|
|
7137
|
+
cur = ggml_add(ctx0, cur, ffn_output);
|
|
6711
7138
|
cur = ggml_add(ctx0, cur, inpL);
|
|
6712
7139
|
|
|
6713
7140
|
cur = build_cvec(cur, il);
|
|
@@ -6717,17 +7144,18 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6717
7144
|
inpL = cur;
|
|
6718
7145
|
}
|
|
6719
7146
|
|
|
6720
|
-
cur = inpL
|
|
6721
|
-
|
|
6722
|
-
|
|
6723
|
-
|
|
6724
|
-
LLM_NORM_RMS, -1);
|
|
7147
|
+
cur = build_norm(inpL,
|
|
7148
|
+
model.output_norm,
|
|
7149
|
+
model.output_norm_b,
|
|
7150
|
+
LLM_NORM, -1);
|
|
6725
7151
|
|
|
6726
7152
|
cb(cur, "result_norm", -1);
|
|
6727
7153
|
res->t_embd = cur;
|
|
6728
7154
|
|
|
6729
|
-
// lm_head
|
|
6730
7155
|
cur = build_lora_mm(model.output, cur);
|
|
7156
|
+
cb(cur, "result_output_no_bias", -1);
|
|
7157
|
+
|
|
7158
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
6731
7159
|
|
|
6732
7160
|
cb(cur, "result_output", -1);
|
|
6733
7161
|
res->t_logits = cur;
|
|
@@ -6736,15 +7164,14 @@ struct llm_build_plamo : public llm_graph_context {
|
|
|
6736
7164
|
}
|
|
6737
7165
|
};
|
|
6738
7166
|
|
|
6739
|
-
struct
|
|
6740
|
-
|
|
7167
|
+
struct llm_build_phi3 : public llm_graph_context {
|
|
7168
|
+
llm_build_phi3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
6741
7169
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
6742
|
-
const int64_t n_embd_gqa
|
|
7170
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
6743
7171
|
|
|
6744
7172
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
6745
7173
|
|
|
6746
7174
|
ggml_tensor * cur;
|
|
6747
|
-
ggml_tensor * pos;
|
|
6748
7175
|
ggml_tensor * inpL;
|
|
6749
7176
|
|
|
6750
7177
|
inpL = build_inp_embd(model.tok_embd);
|
|
@@ -6754,30 +7181,36 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
6754
7181
|
|
|
6755
7182
|
auto * inp_attn = build_attn_inp_kv_unified();
|
|
6756
7183
|
|
|
6757
|
-
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
6758
|
-
cb(pos, "pos_embd", -1);
|
|
6759
|
-
|
|
6760
|
-
inpL = ggml_add(ctx0, inpL, pos);
|
|
6761
|
-
cb(inpL, "inpL", -1);
|
|
6762
|
-
|
|
6763
7184
|
for (int il = 0; il < n_layer; ++il) {
|
|
6764
|
-
|
|
6765
|
-
model.layers[il].attn_norm,
|
|
6766
|
-
model.layers[il].attn_norm_b,
|
|
6767
|
-
LLM_NORM, il);
|
|
6768
|
-
cb(cur, "attn_norm", il);
|
|
7185
|
+
auto * residual = inpL;
|
|
6769
7186
|
|
|
6770
7187
|
// self-attention
|
|
6771
7188
|
{
|
|
6772
|
-
|
|
6773
|
-
|
|
7189
|
+
// rope freq factors for 128k context
|
|
7190
|
+
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
|
6774
7191
|
|
|
6775
|
-
|
|
6776
|
-
|
|
7192
|
+
ggml_tensor* attn_norm_output = build_norm(inpL,
|
|
7193
|
+
model.layers[il].attn_norm,
|
|
7194
|
+
model.layers[il].attn_norm_b,
|
|
7195
|
+
LLM_NORM_RMS, il);
|
|
7196
|
+
cb(attn_norm_output, "attn_norm", il);
|
|
6777
7197
|
|
|
6778
|
-
ggml_tensor * Qcur =
|
|
6779
|
-
ggml_tensor * Kcur =
|
|
6780
|
-
ggml_tensor * Vcur =
|
|
7198
|
+
ggml_tensor * Qcur = nullptr;
|
|
7199
|
+
ggml_tensor * Kcur = nullptr;
|
|
7200
|
+
ggml_tensor * Vcur = nullptr;
|
|
7201
|
+
|
|
7202
|
+
if (model.layers[il].wqkv) {
|
|
7203
|
+
cur = build_lora_mm(model.layers[il].wqkv, attn_norm_output);
|
|
7204
|
+
cb(cur, "wqkv", il);
|
|
7205
|
+
|
|
7206
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd)));
|
|
7207
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd)));
|
|
7208
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)));
|
|
7209
|
+
} else {
|
|
7210
|
+
Qcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
7211
|
+
Kcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
7212
|
+
Vcur = ggml_add(ctx0, build_lora_mm(model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
7213
|
+
}
|
|
6781
7214
|
|
|
6782
7215
|
cb(Qcur, "Qcur", il);
|
|
6783
7216
|
cb(Kcur, "Kcur", il);
|
|
@@ -6787,39 +7220,300 @@ struct llm_build_gpt2 : public llm_graph_context {
|
|
|
6787
7220
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
6788
7221
|
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
6789
7222
|
|
|
7223
|
+
Qcur = ggml_rope_ext(
|
|
7224
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
7225
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7226
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7227
|
+
);
|
|
7228
|
+
|
|
7229
|
+
Kcur = ggml_rope_ext(
|
|
7230
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
7231
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7232
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7233
|
+
);
|
|
7234
|
+
|
|
7235
|
+
cb(Qcur, "Qcur", il);
|
|
7236
|
+
cb(Kcur, "Kcur", il);
|
|
7237
|
+
cb(Vcur, "Vcur", il);
|
|
7238
|
+
|
|
7239
|
+
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
|
|
7240
|
+
cb(Qcur, "Qcur", il);
|
|
7241
|
+
|
|
6790
7242
|
cur = build_attn(inp_attn, gf,
|
|
6791
7243
|
model.layers[il].wo, model.layers[il].bo,
|
|
6792
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f
|
|
7244
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
6793
7245
|
}
|
|
6794
7246
|
|
|
6795
7247
|
if (il == n_layer - 1) {
|
|
6796
7248
|
// skip computing output for unused tokens
|
|
6797
|
-
ggml_tensor
|
|
6798
|
-
cur
|
|
6799
|
-
|
|
7249
|
+
ggml_tensor* inp_out_ids = build_inp_out_ids();
|
|
7250
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7251
|
+
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
|
|
6800
7252
|
}
|
|
6801
7253
|
|
|
6802
|
-
|
|
6803
|
-
|
|
6804
|
-
cb(ffn_inp, "ffn_inp", il);
|
|
7254
|
+
cur = ggml_add(ctx0, cur, residual);
|
|
7255
|
+
residual = cur;
|
|
6805
7256
|
|
|
6806
|
-
|
|
6807
|
-
|
|
6808
|
-
|
|
6809
|
-
|
|
6810
|
-
model.layers[il].ffn_norm_b,
|
|
6811
|
-
LLM_NORM, il);
|
|
6812
|
-
cb(cur, "ffn_norm", il);
|
|
7257
|
+
cur = build_norm(cur,
|
|
7258
|
+
model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
|
|
7259
|
+
LLM_NORM_RMS, il);
|
|
7260
|
+
cb(cur, "ffn_norm", il);
|
|
6813
7261
|
|
|
7262
|
+
// feed-forward network
|
|
7263
|
+
if (model.layers[il].ffn_gate_inp == nullptr) {
|
|
6814
7264
|
cur = build_ffn(cur,
|
|
6815
|
-
model.layers[il].ffn_up,
|
|
6816
|
-
NULL, NULL,
|
|
6817
|
-
model.layers[il].ffn_down,
|
|
7265
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
7266
|
+
NULL, NULL, NULL,
|
|
7267
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
6818
7268
|
NULL,
|
|
6819
|
-
|
|
7269
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
6820
7270
|
cb(cur, "ffn_out", il);
|
|
6821
|
-
}
|
|
6822
|
-
|
|
7271
|
+
} else {
|
|
7272
|
+
// MoE branch
|
|
7273
|
+
cur = build_moe_ffn(cur,
|
|
7274
|
+
model.layers[il].ffn_gate_inp,
|
|
7275
|
+
model.layers[il].ffn_up_exps,
|
|
7276
|
+
model.layers[il].ffn_gate_exps,
|
|
7277
|
+
model.layers[il].ffn_down_exps,
|
|
7278
|
+
nullptr,
|
|
7279
|
+
n_expert, n_expert_used,
|
|
7280
|
+
LLM_FFN_SILU, true,
|
|
7281
|
+
false, 0.0,
|
|
7282
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
7283
|
+
il);
|
|
7284
|
+
cb(cur, "ffn_moe_out", il);
|
|
7285
|
+
}
|
|
7286
|
+
|
|
7287
|
+
cur = ggml_add(ctx0, residual, cur);
|
|
7288
|
+
|
|
7289
|
+
cur = build_cvec(cur, il);
|
|
7290
|
+
cb(cur, "l_out", il);
|
|
7291
|
+
|
|
7292
|
+
// input for next layer
|
|
7293
|
+
inpL = cur;
|
|
7294
|
+
}
|
|
7295
|
+
|
|
7296
|
+
cur = build_norm(inpL,
|
|
7297
|
+
model.output_norm,
|
|
7298
|
+
model.output_norm_b,
|
|
7299
|
+
LLM_NORM_RMS, -1);
|
|
7300
|
+
|
|
7301
|
+
cb(cur, "result_norm", -1);
|
|
7302
|
+
res->t_embd = cur;
|
|
7303
|
+
|
|
7304
|
+
cur = build_lora_mm(model.output, cur);
|
|
7305
|
+
|
|
7306
|
+
if (model.output_b != nullptr) {
|
|
7307
|
+
cb(cur, "result_output_no_bias", -1);
|
|
7308
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
7309
|
+
}
|
|
7310
|
+
|
|
7311
|
+
cb(cur, "result_output", -1);
|
|
7312
|
+
res->t_logits = cur;
|
|
7313
|
+
|
|
7314
|
+
ggml_build_forward_expand(gf, cur);
|
|
7315
|
+
}
|
|
7316
|
+
};
|
|
7317
|
+
|
|
7318
|
+
struct llm_build_plamo : public llm_graph_context {
|
|
7319
|
+
llm_build_plamo(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7320
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7321
|
+
|
|
7322
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7323
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
7324
|
+
|
|
7325
|
+
ggml_tensor * cur;
|
|
7326
|
+
ggml_tensor * inpL;
|
|
7327
|
+
|
|
7328
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
7329
|
+
|
|
7330
|
+
// inp_pos - contains the positions
|
|
7331
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
7332
|
+
|
|
7333
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7334
|
+
|
|
7335
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7336
|
+
|
|
7337
|
+
// norm
|
|
7338
|
+
cur = build_norm(inpL,
|
|
7339
|
+
model.layers[il].attn_norm, NULL,
|
|
7340
|
+
LLM_NORM_RMS, il);
|
|
7341
|
+
cb(cur, "attn_norm", il);
|
|
7342
|
+
|
|
7343
|
+
ggml_tensor * attention_norm = cur;
|
|
7344
|
+
|
|
7345
|
+
// self-attention
|
|
7346
|
+
{
|
|
7347
|
+
// compute Q and K and RoPE them
|
|
7348
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
7349
|
+
cb(Qcur, "Qcur", il);
|
|
7350
|
+
|
|
7351
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
7352
|
+
cb(Kcur, "Kcur", il);
|
|
7353
|
+
|
|
7354
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
7355
|
+
cb(Vcur, "Vcur", il);
|
|
7356
|
+
|
|
7357
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7358
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7359
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7360
|
+
|
|
7361
|
+
Qcur = ggml_rope_ext(
|
|
7362
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
7363
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7364
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7365
|
+
);
|
|
7366
|
+
|
|
7367
|
+
Kcur = ggml_rope_ext(
|
|
7368
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
7369
|
+
n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
7370
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
7371
|
+
);
|
|
7372
|
+
|
|
7373
|
+
cb(Qcur, "Qcur", il);
|
|
7374
|
+
cb(Kcur, "Kcur", il);
|
|
7375
|
+
cb(Vcur, "Vcur", il);
|
|
7376
|
+
|
|
7377
|
+
cur = build_attn(inp_attn, gf,
|
|
7378
|
+
model.layers[il].wo, NULL,
|
|
7379
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7380
|
+
}
|
|
7381
|
+
ggml_tensor * sa_out = cur;
|
|
7382
|
+
|
|
7383
|
+
cur = attention_norm;
|
|
7384
|
+
|
|
7385
|
+
if (il == n_layer - 1) {
|
|
7386
|
+
// skip computing output for unused tokens
|
|
7387
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7388
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7389
|
+
sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids);
|
|
7390
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7391
|
+
}
|
|
7392
|
+
|
|
7393
|
+
// feed-forward network
|
|
7394
|
+
{
|
|
7395
|
+
cur = build_ffn(cur,
|
|
7396
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
7397
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
7398
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
7399
|
+
NULL,
|
|
7400
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
7401
|
+
cb(cur, "ffn_out", il);
|
|
7402
|
+
}
|
|
7403
|
+
|
|
7404
|
+
cur = ggml_add(ctx0, cur, sa_out);
|
|
7405
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
7406
|
+
|
|
7407
|
+
cur = build_cvec(cur, il);
|
|
7408
|
+
cb(cur, "l_out", il);
|
|
7409
|
+
|
|
7410
|
+
// input for next layer
|
|
7411
|
+
inpL = cur;
|
|
7412
|
+
}
|
|
7413
|
+
|
|
7414
|
+
cur = inpL;
|
|
7415
|
+
|
|
7416
|
+
cur = build_norm(cur,
|
|
7417
|
+
model.output_norm, NULL,
|
|
7418
|
+
LLM_NORM_RMS, -1);
|
|
7419
|
+
|
|
7420
|
+
cb(cur, "result_norm", -1);
|
|
7421
|
+
res->t_embd = cur;
|
|
7422
|
+
|
|
7423
|
+
// lm_head
|
|
7424
|
+
cur = build_lora_mm(model.output, cur);
|
|
7425
|
+
|
|
7426
|
+
cb(cur, "result_output", -1);
|
|
7427
|
+
res->t_logits = cur;
|
|
7428
|
+
|
|
7429
|
+
ggml_build_forward_expand(gf, cur);
|
|
7430
|
+
}
|
|
7431
|
+
};
|
|
7432
|
+
|
|
7433
|
+
struct llm_build_gpt2 : public llm_graph_context {
|
|
7434
|
+
llm_build_gpt2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
7435
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
7436
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
7437
|
+
|
|
7438
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
7439
|
+
|
|
7440
|
+
ggml_tensor * cur;
|
|
7441
|
+
ggml_tensor * pos;
|
|
7442
|
+
ggml_tensor * inpL;
|
|
7443
|
+
|
|
7444
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
7445
|
+
|
|
7446
|
+
// inp_pos - contains the positions
|
|
7447
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
7448
|
+
|
|
7449
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
7450
|
+
|
|
7451
|
+
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
|
7452
|
+
cb(pos, "pos_embd", -1);
|
|
7453
|
+
|
|
7454
|
+
inpL = ggml_add(ctx0, inpL, pos);
|
|
7455
|
+
cb(inpL, "inpL", -1);
|
|
7456
|
+
|
|
7457
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
7458
|
+
cur = build_norm(inpL,
|
|
7459
|
+
model.layers[il].attn_norm,
|
|
7460
|
+
model.layers[il].attn_norm_b,
|
|
7461
|
+
LLM_NORM, il);
|
|
7462
|
+
cb(cur, "attn_norm", il);
|
|
7463
|
+
|
|
7464
|
+
// self-attention
|
|
7465
|
+
{
|
|
7466
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
7467
|
+
cb(cur, "wqkv", il);
|
|
7468
|
+
|
|
7469
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
7470
|
+
cb(cur, "bqkv", il);
|
|
7471
|
+
|
|
7472
|
+
ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
7473
|
+
ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
7474
|
+
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
7475
|
+
|
|
7476
|
+
cb(Qcur, "Qcur", il);
|
|
7477
|
+
cb(Kcur, "Kcur", il);
|
|
7478
|
+
cb(Vcur, "Vcur", il);
|
|
7479
|
+
|
|
7480
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
7481
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
7482
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
7483
|
+
|
|
7484
|
+
cur = build_attn(inp_attn, gf,
|
|
7485
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
7486
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7487
|
+
}
|
|
7488
|
+
|
|
7489
|
+
if (il == n_layer - 1) {
|
|
7490
|
+
// skip computing output for unused tokens
|
|
7491
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
7492
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
7493
|
+
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
|
|
7494
|
+
}
|
|
7495
|
+
|
|
7496
|
+
// add the input
|
|
7497
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
|
7498
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
7499
|
+
|
|
7500
|
+
// FF
|
|
7501
|
+
{
|
|
7502
|
+
cur = build_norm(ffn_inp,
|
|
7503
|
+
model.layers[il].ffn_norm,
|
|
7504
|
+
model.layers[il].ffn_norm_b,
|
|
7505
|
+
LLM_NORM, il);
|
|
7506
|
+
cb(cur, "ffn_norm", il);
|
|
7507
|
+
|
|
7508
|
+
cur = build_ffn(cur,
|
|
7509
|
+
model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL,
|
|
7510
|
+
NULL, NULL, NULL,
|
|
7511
|
+
model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL,
|
|
7512
|
+
NULL,
|
|
7513
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
7514
|
+
cb(cur, "ffn_out", il);
|
|
7515
|
+
}
|
|
7516
|
+
|
|
6823
7517
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
6824
7518
|
|
|
6825
7519
|
cur = build_cvec(cur, il);
|
|
@@ -6905,7 +7599,7 @@ struct llm_build_codeshell : public llm_graph_context {
|
|
|
6905
7599
|
|
|
6906
7600
|
cur = build_attn(inp_attn, gf,
|
|
6907
7601
|
model.layers[il].wo, model.layers[il].bo,
|
|
6908
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7602
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
6909
7603
|
}
|
|
6910
7604
|
|
|
6911
7605
|
if (il == n_layer - 1) {
|
|
@@ -7034,7 +7728,7 @@ struct llm_build_orion : public llm_graph_context {
|
|
|
7034
7728
|
|
|
7035
7729
|
cur = build_attn(inp_attn, gf,
|
|
7036
7730
|
model.layers[il].wo, NULL,
|
|
7037
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7731
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7038
7732
|
}
|
|
7039
7733
|
|
|
7040
7734
|
if (il == n_layer - 1) {
|
|
@@ -7161,7 +7855,7 @@ struct llm_build_internlm2 : public llm_graph_context {
|
|
|
7161
7855
|
|
|
7162
7856
|
cur = build_attn(inp_attn, gf,
|
|
7163
7857
|
model.layers[il].wo, model.layers[il].bo,
|
|
7164
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7858
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7165
7859
|
}
|
|
7166
7860
|
|
|
7167
7861
|
if (il == n_layer - 1) {
|
|
@@ -7358,7 +8052,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
|
|
|
7358
8052
|
|
|
7359
8053
|
cur = build_attn(inp_attn, gf,
|
|
7360
8054
|
model.layers[il].wo, NULL,
|
|
7361
|
-
q_states, k_states, v_states, nullptr, kq_scale, il);
|
|
8055
|
+
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
7362
8056
|
}
|
|
7363
8057
|
|
|
7364
8058
|
if (il == n_layer - 1) {
|
|
@@ -7488,7 +8182,7 @@ struct llm_build_gemma : public llm_graph_context {
|
|
|
7488
8182
|
|
|
7489
8183
|
cur = build_attn(inp_attn, gf,
|
|
7490
8184
|
model.layers[il].wo, NULL,
|
|
7491
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
8185
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7492
8186
|
}
|
|
7493
8187
|
|
|
7494
8188
|
if (il == n_layer - 1) {
|
|
@@ -7610,7 +8304,7 @@ struct llm_build_gemma2 : public llm_graph_context {
|
|
|
7610
8304
|
|
|
7611
8305
|
cur = build_attn(inp_attn, gf,
|
|
7612
8306
|
model.layers[il].wo, NULL,
|
|
7613
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
8307
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
7614
8308
|
}
|
|
7615
8309
|
|
|
7616
8310
|
cur = build_norm(cur,
|
|
@@ -7751,7 +8445,7 @@ struct llm_build_gemma3 : public llm_graph_context {
|
|
|
7751
8445
|
|
|
7752
8446
|
cur = build_attn(inp_attn, gf,
|
|
7753
8447
|
model.layers[il].wo, NULL,
|
|
7754
|
-
Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il);
|
|
8448
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il);
|
|
7755
8449
|
}
|
|
7756
8450
|
|
|
7757
8451
|
cur = build_norm(cur,
|
|
@@ -7891,7 +8585,7 @@ struct llm_build_starcoder2 : public llm_graph_context {
|
|
|
7891
8585
|
|
|
7892
8586
|
cur = build_attn(inp_attn, gf,
|
|
7893
8587
|
model.layers[il].wo, model.layers[il].bo,
|
|
7894
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8588
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
7895
8589
|
}
|
|
7896
8590
|
|
|
7897
8591
|
if (il == n_layer - 1) {
|
|
@@ -8226,7 +8920,7 @@ struct llm_build_command_r : public llm_graph_context {
|
|
|
8226
8920
|
|
|
8227
8921
|
cur = build_attn(inp_attn, gf,
|
|
8228
8922
|
model.layers[il].wo, model.layers[il].bo,
|
|
8229
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8923
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8230
8924
|
}
|
|
8231
8925
|
|
|
8232
8926
|
if (il == n_layer - 1) {
|
|
@@ -8361,7 +9055,7 @@ struct llm_build_cohere2 : public llm_graph_context {
|
|
|
8361
9055
|
|
|
8362
9056
|
cur = build_attn(inp_attn, gf,
|
|
8363
9057
|
model.layers[il].wo, model.layers[il].bo,
|
|
8364
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9058
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8365
9059
|
}
|
|
8366
9060
|
|
|
8367
9061
|
if (il == n_layer - 1) {
|
|
@@ -8492,7 +9186,7 @@ struct llm_build_olmo : public llm_graph_context {
|
|
|
8492
9186
|
|
|
8493
9187
|
cur = build_attn(inp_attn, gf,
|
|
8494
9188
|
model.layers[il].wo, nullptr,
|
|
8495
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9189
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8496
9190
|
}
|
|
8497
9191
|
|
|
8498
9192
|
if (il == n_layer - 1) {
|
|
@@ -8612,7 +9306,7 @@ struct llm_build_olmo2 : public llm_graph_context {
|
|
|
8612
9306
|
|
|
8613
9307
|
cur = build_attn(inp_attn, gf,
|
|
8614
9308
|
model.layers[il].wo, NULL,
|
|
8615
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9309
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8616
9310
|
}
|
|
8617
9311
|
|
|
8618
9312
|
cur = build_norm(cur,
|
|
@@ -8745,7 +9439,7 @@ struct llm_build_olmoe : public llm_graph_context {
|
|
|
8745
9439
|
|
|
8746
9440
|
cur = build_attn(inp_attn, gf,
|
|
8747
9441
|
model.layers[il].wo, NULL,
|
|
8748
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9442
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8749
9443
|
}
|
|
8750
9444
|
|
|
8751
9445
|
if (il == n_layer - 1) {
|
|
@@ -8878,7 +9572,7 @@ struct llm_build_openelm : public llm_graph_context {
|
|
|
8878
9572
|
|
|
8879
9573
|
cur = build_attn(inp_attn, gf,
|
|
8880
9574
|
model.layers[il].wo, NULL,
|
|
8881
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9575
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8882
9576
|
}
|
|
8883
9577
|
|
|
8884
9578
|
if (il == n_layer - 1) {
|
|
@@ -8992,7 +9686,7 @@ struct llm_build_gptneox : public llm_graph_context {
|
|
|
8992
9686
|
|
|
8993
9687
|
cur = build_attn(inp_attn, gf,
|
|
8994
9688
|
model.layers[il].wo, model.layers[il].bo,
|
|
8995
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9689
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
8996
9690
|
}
|
|
8997
9691
|
|
|
8998
9692
|
if (il == n_layer - 1) {
|
|
@@ -9142,7 +9836,7 @@ struct llm_build_arctic : public llm_graph_context {
|
|
|
9142
9836
|
|
|
9143
9837
|
cur = build_attn(inp_attn, gf,
|
|
9144
9838
|
model.layers[il].wo, NULL,
|
|
9145
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9839
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9146
9840
|
}
|
|
9147
9841
|
|
|
9148
9842
|
if (il == n_layer - 1) {
|
|
@@ -9297,7 +9991,7 @@ struct llm_build_deepseek : public llm_graph_context {
|
|
|
9297
9991
|
|
|
9298
9992
|
cur = build_attn(inp_attn, gf,
|
|
9299
9993
|
model.layers[il].wo, model.layers[il].bo,
|
|
9300
|
-
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
|
|
9994
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
9301
9995
|
}
|
|
9302
9996
|
|
|
9303
9997
|
if (il == n_layer - 1) {
|
|
@@ -9387,15 +10081,22 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9387
10081
|
llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
9388
10082
|
bool is_lite = (hparams.n_layer == 27);
|
|
9389
10083
|
|
|
10084
|
+
const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
|
|
10085
|
+
|
|
10086
|
+
// note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA
|
|
10087
|
+
const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k;
|
|
10088
|
+
const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v;
|
|
10089
|
+
|
|
10090
|
+
const int64_t n_embd_head_qk_rope = hparams.n_rot;
|
|
10091
|
+
const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;
|
|
10092
|
+
|
|
10093
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
10094
|
+
|
|
9390
10095
|
// We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
|
|
9391
10096
|
// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
|
|
9392
10097
|
const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
|
|
9393
|
-
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(
|
|
9394
|
-
const float
|
|
9395
|
-
|
|
9396
|
-
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
9397
|
-
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
9398
|
-
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
10098
|
+
const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
|
|
10099
|
+
const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
|
|
9399
10100
|
|
|
9400
10101
|
ggml_tensor * cur;
|
|
9401
10102
|
ggml_tensor * inpL;
|
|
@@ -9421,16 +10122,14 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9421
10122
|
{
|
|
9422
10123
|
ggml_tensor * q = NULL;
|
|
9423
10124
|
if (!is_lite) {
|
|
9424
|
-
// {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
|
|
9425
10125
|
q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
|
|
9426
10126
|
cb(q, "q", il);
|
|
9427
10127
|
|
|
9428
10128
|
q = build_norm(q,
|
|
9429
|
-
model.layers[il].attn_q_a_norm,
|
|
10129
|
+
model.layers[il].attn_q_a_norm, nullptr,
|
|
9430
10130
|
LLM_NORM_RMS, il);
|
|
9431
10131
|
cb(q, "q", il);
|
|
9432
10132
|
|
|
9433
|
-
// {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
|
|
9434
10133
|
q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
|
|
9435
10134
|
cb(q, "q", il);
|
|
9436
10135
|
} else {
|
|
@@ -9438,96 +10137,125 @@ struct llm_build_deepseek2 : public llm_graph_context {
|
|
|
9438
10137
|
cb(q, "q", il);
|
|
9439
10138
|
}
|
|
9440
10139
|
|
|
9441
|
-
// split into {
|
|
9442
|
-
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
9443
|
-
|
|
9444
|
-
ggml_row_size(q->type,
|
|
10140
|
+
// split into {n_embd_head_qk_nope, n_head, n_tokens}
|
|
10141
|
+
ggml_tensor * q_nope = ggml_view_3d(ctx0, q,
|
|
10142
|
+
n_embd_head_qk_nope, n_head, n_tokens,
|
|
10143
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10144
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9445
10145
|
0);
|
|
9446
10146
|
cb(q_nope, "q_nope", il);
|
|
9447
10147
|
|
|
9448
|
-
// and {
|
|
9449
|
-
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
9450
|
-
|
|
9451
|
-
ggml_row_size(q->type,
|
|
10148
|
+
// and {n_embd_head_qk_rope, n_head, n_tokens}
|
|
10149
|
+
ggml_tensor * q_pe = ggml_view_3d(ctx0, q,
|
|
10150
|
+
n_embd_head_qk_rope, n_head, n_tokens,
|
|
10151
|
+
ggml_row_size(q->type, n_embd_head_k),
|
|
10152
|
+
ggml_row_size(q->type, n_embd_head_k) * n_head,
|
|
9452
10153
|
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
9453
10154
|
cb(q_pe, "q_pe", il);
|
|
9454
10155
|
|
|
9455
|
-
|
|
9456
|
-
|
|
9457
|
-
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
10156
|
+
ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
10157
|
+
cb(kv_cmpr_pe, "kv_cmpr_pe", il);
|
|
9458
10158
|
|
|
9459
10159
|
// split into {kv_lora_rank, n_tokens}
|
|
9460
|
-
ggml_tensor *
|
|
9461
|
-
|
|
10160
|
+
ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
|
|
10161
|
+
kv_lora_rank, n_tokens,
|
|
10162
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
9462
10163
|
0);
|
|
9463
|
-
cb(
|
|
10164
|
+
cb(kv_cmpr, "kv_cmpr", il);
|
|
10165
|
+
|
|
10166
|
+
// and {n_embd_head_qk_rope, 1, n_tokens}
|
|
10167
|
+
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
|
|
10168
|
+
n_embd_head_qk_rope, 1, n_tokens,
|
|
10169
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
10170
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
|
|
10171
|
+
ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
|
|
10172
|
+
cb(k_pe, "k_pe", il);
|
|
9464
10173
|
|
|
9465
|
-
|
|
9466
|
-
|
|
9467
|
-
|
|
9468
|
-
|
|
9469
|
-
|
|
10174
|
+
q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
|
|
10175
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10176
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10177
|
+
);
|
|
10178
|
+
cb(q_pe, "q_pe", il);
|
|
10179
|
+
|
|
10180
|
+
k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
|
|
10181
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
10182
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
10183
|
+
);
|
|
9470
10184
|
cb(k_pe, "k_pe", il);
|
|
9471
10185
|
|
|
9472
|
-
|
|
9473
|
-
|
|
9474
|
-
kv_compressed = build_norm(kv_compressed,
|
|
9475
|
-
model.layers[il].attn_kv_a_norm, NULL,
|
|
10186
|
+
kv_cmpr = build_norm(kv_cmpr,
|
|
10187
|
+
model.layers[il].attn_kv_a_norm, nullptr,
|
|
9476
10188
|
LLM_NORM_RMS, il);
|
|
9477
|
-
cb(
|
|
10189
|
+
cb(kv_cmpr, "kv_cmpr", il);
|
|
9478
10190
|
|
|
9479
|
-
|
|
9480
|
-
|
|
9481
|
-
|
|
10191
|
+
if (is_mla) {
|
|
10192
|
+
// {n_embd_head_qk_nope, n_tokens, n_head}
|
|
10193
|
+
q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
|
|
10194
|
+
cb(q_nope, "q_nope_perm", il);
|
|
9482
10195
|
|
|
9483
|
-
|
|
9484
|
-
|
|
9485
|
-
|
|
9486
|
-
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
9487
|
-
0);
|
|
9488
|
-
cb(k_nope, "k_nope", il);
|
|
10196
|
+
// {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head}
|
|
10197
|
+
ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
|
|
10198
|
+
cb(q_nope_absorbed, "q_nope_absorbed", il);
|
|
9489
10199
|
|
|
9490
|
-
|
|
9491
|
-
|
|
9492
|
-
|
|
9493
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
9494
|
-
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
9495
|
-
cb(v_states, "v_states", il);
|
|
10200
|
+
// {kv_lora_rank, n_head, n_tokens}
|
|
10201
|
+
q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3);
|
|
10202
|
+
cb(q_nope_absorbed, "q_nope_absorbed_perm", il);
|
|
9496
10203
|
|
|
9497
|
-
|
|
9498
|
-
|
|
10204
|
+
// {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens}
|
|
10205
|
+
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
10206
|
+
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0);
|
|
10207
|
+
cb(Qcur, "Qcur", il);
|
|
9499
10208
|
|
|
9500
|
-
|
|
9501
|
-
|
|
9502
|
-
0);
|
|
9503
|
-
cb(v_states, "v_states", il);
|
|
10209
|
+
kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens);
|
|
10210
|
+
cb(kv_cmpr, "kv_cmpr_reshape", il);
|
|
9504
10211
|
|
|
9505
|
-
|
|
9506
|
-
|
|
9507
|
-
|
|
9508
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9509
|
-
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
9510
|
-
);
|
|
9511
|
-
cb(q_pe, "q_pe", il);
|
|
10212
|
+
// {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens}
|
|
10213
|
+
ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0);
|
|
10214
|
+
cb(Kcur, "Kcur", il);
|
|
9512
10215
|
|
|
9513
|
-
|
|
9514
|
-
|
|
9515
|
-
|
|
9516
|
-
ctx0, k_pe, inp_pos, nullptr,
|
|
9517
|
-
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
9518
|
-
ext_factor, attn_factor_scaled, beta_fast, beta_slow
|
|
9519
|
-
);
|
|
9520
|
-
cb(k_pe, "k_pe", il);
|
|
10216
|
+
// {kv_lora_rank, 1, n_tokens}
|
|
10217
|
+
ggml_tensor * Vcur = kv_cmpr;
|
|
10218
|
+
cb(Vcur, "Vcur", il);
|
|
9521
10219
|
|
|
9522
|
-
|
|
9523
|
-
|
|
10220
|
+
// note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
|
|
10221
|
+
cur = build_attn(inp_attn, gf,
|
|
10222
|
+
model.layers[il].wo, NULL,
|
|
10223
|
+
Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il);
|
|
10224
|
+
} else {
|
|
10225
|
+
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
|
|
10226
|
+
cb(kv, "kv", il);
|
|
10227
|
+
|
|
10228
|
+
// split into {n_embd_head_qk_nope, n_head, n_tokens}
|
|
10229
|
+
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
|
|
10230
|
+
n_embd_head_qk_nope, n_head, n_tokens,
|
|
10231
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
|
|
10232
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
|
|
10233
|
+
0);
|
|
10234
|
+
cb(k_nope, "k_nope_view", il);
|
|
9524
10235
|
|
|
9525
|
-
|
|
9526
|
-
|
|
10236
|
+
// and {n_embd_head_v, n_head, n_tokens}
|
|
10237
|
+
ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
|
|
10238
|
+
n_embd_head_v, n_head, n_tokens,
|
|
10239
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v),
|
|
10240
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head,
|
|
10241
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope));
|
|
10242
|
+
cb(Vcur, "Vcur_view", il);
|
|
9527
10243
|
|
|
9528
|
-
|
|
9529
|
-
|
|
9530
|
-
|
|
10244
|
+
Vcur = ggml_cont(ctx0, Vcur);
|
|
10245
|
+
cb(Vcur, "Vcur_cont", il);
|
|
10246
|
+
|
|
10247
|
+
// note: rope must go first for in-place context shifting in build_rope_shift()
|
|
10248
|
+
ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0);
|
|
10249
|
+
cb(Qcur, "Qcur", il);
|
|
10250
|
+
|
|
10251
|
+
ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0);
|
|
10252
|
+
cb(Kcur, "Kcur", il);
|
|
10253
|
+
|
|
10254
|
+
// note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
|
|
10255
|
+
cur = build_attn(inp_attn, gf,
|
|
10256
|
+
model.layers[il].wo, NULL,
|
|
10257
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il);
|
|
10258
|
+
}
|
|
9531
10259
|
}
|
|
9532
10260
|
|
|
9533
10261
|
if (il == n_layer - 1) {
|
|
@@ -9693,7 +10421,7 @@ struct llm_build_bitnet : public llm_graph_context {
|
|
|
9693
10421
|
|
|
9694
10422
|
cur = build_attn(inp_attn, gf,
|
|
9695
10423
|
NULL, NULL,
|
|
9696
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10424
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
9697
10425
|
|
|
9698
10426
|
cur = build_norm(cur,
|
|
9699
10427
|
model.layers[il].attn_sub_norm, NULL,
|
|
@@ -9816,7 +10544,7 @@ struct llm_build_t5_enc : public llm_graph_context {
|
|
|
9816
10544
|
|
|
9817
10545
|
cur = build_attn(inp_attn, gf,
|
|
9818
10546
|
model.layers[il].wo_enc, nullptr,
|
|
9819
|
-
Qcur, Kcur, Vcur, kq_b, 1.0f, il);
|
|
10547
|
+
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
9820
10548
|
cb(cur, "kqv_out", il);
|
|
9821
10549
|
}
|
|
9822
10550
|
|
|
@@ -9922,7 +10650,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
9922
10650
|
|
|
9923
10651
|
cur = build_attn(inp_attn_self, gf,
|
|
9924
10652
|
model.layers[il].wo, model.layers[il].bo,
|
|
9925
|
-
Qcur, Kcur, Vcur, kq_b, 1.0f, il);
|
|
10653
|
+
Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il);
|
|
9926
10654
|
cb(cur, "kqv_out", il);
|
|
9927
10655
|
}
|
|
9928
10656
|
|
|
@@ -9954,7 +10682,7 @@ struct llm_build_t5_dec : public llm_graph_context {
|
|
|
9954
10682
|
|
|
9955
10683
|
cur = build_attn(inp_attn_cross, gf,
|
|
9956
10684
|
model.layers[il].wo_cross, nullptr,
|
|
9957
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f, il);
|
|
10685
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il);
|
|
9958
10686
|
cb(cur, "kqv_out", il);
|
|
9959
10687
|
|
|
9960
10688
|
//ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
|
@@ -10087,7 +10815,7 @@ struct llm_build_jais : public llm_graph_context {
|
|
|
10087
10815
|
|
|
10088
10816
|
cur = build_attn(inp_attn, gf,
|
|
10089
10817
|
model.layers[il].wo, model.layers[il].bo,
|
|
10090
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il);
|
|
10818
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il);
|
|
10091
10819
|
}
|
|
10092
10820
|
|
|
10093
10821
|
if (il == n_layer - 1) {
|
|
@@ -10219,7 +10947,7 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
10219
10947
|
|
|
10220
10948
|
cur = build_attn(inp_attn, gf,
|
|
10221
10949
|
model.layers[il].wo, NULL,
|
|
10222
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10950
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10223
10951
|
}
|
|
10224
10952
|
|
|
10225
10953
|
if (il == n_layer - 1) {
|
|
@@ -10272,6 +11000,157 @@ struct llm_build_chatglm : public llm_graph_context {
|
|
|
10272
11000
|
}
|
|
10273
11001
|
};
|
|
10274
11002
|
|
|
11003
|
+
struct llm_build_glm4 : public llm_graph_context {
|
|
11004
|
+
llm_build_glm4(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
11005
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
11006
|
+
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
11007
|
+
|
|
11008
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
11009
|
+
|
|
11010
|
+
ggml_tensor * cur;
|
|
11011
|
+
ggml_tensor * inpL;
|
|
11012
|
+
|
|
11013
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
11014
|
+
|
|
11015
|
+
// inp_pos - contains the positions
|
|
11016
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
11017
|
+
|
|
11018
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11019
|
+
|
|
11020
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
11021
|
+
ggml_tensor * inpSA = inpL;
|
|
11022
|
+
|
|
11023
|
+
// Pre-attention norm
|
|
11024
|
+
cur = build_norm(inpL,
|
|
11025
|
+
model.layers[il].attn_norm,
|
|
11026
|
+
NULL,
|
|
11027
|
+
LLM_NORM_RMS, il);
|
|
11028
|
+
cb(cur, "attn_norm", il);
|
|
11029
|
+
|
|
11030
|
+
// self-attention
|
|
11031
|
+
{
|
|
11032
|
+
ggml_tensor * Qcur = nullptr;
|
|
11033
|
+
ggml_tensor * Kcur = nullptr;
|
|
11034
|
+
ggml_tensor * Vcur = nullptr;
|
|
11035
|
+
|
|
11036
|
+
if (model.layers[il].wqkv == nullptr) {
|
|
11037
|
+
Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
11038
|
+
if (model.layers[il].bq) {
|
|
11039
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
11040
|
+
}
|
|
11041
|
+
Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
11042
|
+
if (model.layers[il].bk) {
|
|
11043
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
11044
|
+
}
|
|
11045
|
+
Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
11046
|
+
if (model.layers[il].bv) {
|
|
11047
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
11048
|
+
}
|
|
11049
|
+
} else {
|
|
11050
|
+
cur = build_lora_mm(model.layers[il].wqkv, cur);
|
|
11051
|
+
cb(cur, "wqkv", il);
|
|
11052
|
+
if (model.layers[il].bqkv) {
|
|
11053
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
11054
|
+
cb(cur, "bqkv", il);
|
|
11055
|
+
}
|
|
11056
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
11057
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
11058
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
11059
|
+
}
|
|
11060
|
+
|
|
11061
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
|
11062
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
|
11063
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
|
|
11064
|
+
|
|
11065
|
+
Qcur = ggml_rope_ext(
|
|
11066
|
+
ctx0, Qcur, inp_pos, nullptr,
|
|
11067
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11068
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11069
|
+
);
|
|
11070
|
+
|
|
11071
|
+
Kcur = ggml_rope_ext(
|
|
11072
|
+
ctx0, Kcur, inp_pos, nullptr,
|
|
11073
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
11074
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
11075
|
+
);
|
|
11076
|
+
|
|
11077
|
+
cb(Qcur, "Qcur", il);
|
|
11078
|
+
cb(Kcur, "Kcur", il);
|
|
11079
|
+
cb(Vcur, "Vcur", il);
|
|
11080
|
+
|
|
11081
|
+
cur = build_attn(inp_attn, gf,
|
|
11082
|
+
model.layers[il].wo, NULL,
|
|
11083
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11084
|
+
}
|
|
11085
|
+
|
|
11086
|
+
if (il == n_layer - 1) {
|
|
11087
|
+
// skip computing output for unused tokens
|
|
11088
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
11089
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
11090
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
11091
|
+
}
|
|
11092
|
+
|
|
11093
|
+
// Post-attention norm (new!)
|
|
11094
|
+
cur = build_norm(cur,
|
|
11095
|
+
model.layers[il].attn_post_norm,
|
|
11096
|
+
NULL,
|
|
11097
|
+
LLM_NORM_RMS, il);
|
|
11098
|
+
cb(cur, "post_attn_norm", il);
|
|
11099
|
+
|
|
11100
|
+
// Add the input (residual connection after post-attention norm)
|
|
11101
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
11102
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
11103
|
+
|
|
11104
|
+
// FF
|
|
11105
|
+
{
|
|
11106
|
+
// Pre-MLP norm
|
|
11107
|
+
cur = build_norm(ffn_inp,
|
|
11108
|
+
model.layers[il].ffn_norm,
|
|
11109
|
+
NULL,
|
|
11110
|
+
LLM_NORM_RMS, il);
|
|
11111
|
+
cb(cur, "ffn_norm", il);
|
|
11112
|
+
|
|
11113
|
+
// MLP
|
|
11114
|
+
cur = build_ffn(cur,
|
|
11115
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
11116
|
+
NULL, NULL, NULL,
|
|
11117
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
11118
|
+
NULL,
|
|
11119
|
+
LLM_FFN_SWIGLU, LLM_FFN_SEQ, il);
|
|
11120
|
+
cb(cur, "ffn_out", il);
|
|
11121
|
+
|
|
11122
|
+
// Post-MLP norm
|
|
11123
|
+
cur = build_norm(cur,
|
|
11124
|
+
model.layers[il].ffn_post_norm,
|
|
11125
|
+
NULL,
|
|
11126
|
+
LLM_NORM_RMS, il);
|
|
11127
|
+
cb(cur, "post_mlp_norm", il);
|
|
11128
|
+
}
|
|
11129
|
+
|
|
11130
|
+
// Add residual connection after post-MLP norm
|
|
11131
|
+
inpL = ggml_add(ctx0, cur, ffn_inp);
|
|
11132
|
+
cb(inpL, "l_out", il);
|
|
11133
|
+
}
|
|
11134
|
+
|
|
11135
|
+
// Final norm
|
|
11136
|
+
cur = build_norm(inpL,
|
|
11137
|
+
model.output_norm,
|
|
11138
|
+
NULL,
|
|
11139
|
+
LLM_NORM_RMS, -1);
|
|
11140
|
+
|
|
11141
|
+
cb(cur, "result_norm", -1);
|
|
11142
|
+
res->t_embd = cur;
|
|
11143
|
+
|
|
11144
|
+
// Output projection
|
|
11145
|
+
cur = build_lora_mm(model.output, cur);
|
|
11146
|
+
|
|
11147
|
+
cb(cur, "result_output", -1);
|
|
11148
|
+
res->t_logits = cur;
|
|
11149
|
+
|
|
11150
|
+
ggml_build_forward_expand(gf, cur);
|
|
11151
|
+
}
|
|
11152
|
+
};
|
|
11153
|
+
|
|
10275
11154
|
struct llm_build_nemotron : public llm_graph_context {
|
|
10276
11155
|
llm_build_nemotron(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
10277
11156
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
@@ -10345,7 +11224,7 @@ struct llm_build_nemotron : public llm_graph_context {
|
|
|
10345
11224
|
|
|
10346
11225
|
cur = build_attn(inp_attn, gf,
|
|
10347
11226
|
model.layers[il].wo, model.layers[il].bo,
|
|
10348
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11227
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10349
11228
|
}
|
|
10350
11229
|
|
|
10351
11230
|
if (il == n_layer - 1) {
|
|
@@ -10476,7 +11355,7 @@ struct llm_build_exaone : public llm_graph_context {
|
|
|
10476
11355
|
|
|
10477
11356
|
cur = build_attn(inp_attn, gf,
|
|
10478
11357
|
model.layers[il].wo, model.layers[il].bo,
|
|
10479
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11358
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
10480
11359
|
}
|
|
10481
11360
|
|
|
10482
11361
|
if (il == n_layer - 1) {
|
|
@@ -11378,7 +12257,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
11378
12257
|
|
|
11379
12258
|
cur = build_attn(inp_attn, gf,
|
|
11380
12259
|
model.layers[il].wo, nullptr,
|
|
11381
|
-
Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
12260
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
|
|
11382
12261
|
|
|
11383
12262
|
if (hparams.swin_norm) {
|
|
11384
12263
|
cur = build_norm(cur,
|
|
@@ -11397,31 +12276,370 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
11397
12276
|
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
11398
12277
|
cb(ffn_inp, "ffn_inp", il);
|
|
11399
12278
|
|
|
11400
|
-
// feed-forward network
|
|
11401
|
-
if (!hparams.swin_norm) {
|
|
11402
|
-
cur = build_norm(ffn_inp,
|
|
11403
|
-
model.layers[il].ffn_norm, NULL,
|
|
11404
|
-
LLM_NORM_RMS, il);
|
|
11405
|
-
cb(cur, "ffn_norm", il);
|
|
11406
|
-
}
|
|
12279
|
+
// feed-forward network
|
|
12280
|
+
if (!hparams.swin_norm) {
|
|
12281
|
+
cur = build_norm(ffn_inp,
|
|
12282
|
+
model.layers[il].ffn_norm, NULL,
|
|
12283
|
+
LLM_NORM_RMS, il);
|
|
12284
|
+
cb(cur, "ffn_norm", il);
|
|
12285
|
+
}
|
|
12286
|
+
|
|
12287
|
+
cur = build_ffn(cur,
|
|
12288
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
12289
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
12290
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
12291
|
+
NULL,
|
|
12292
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
12293
|
+
cb(cur, "ffn_out", il);
|
|
12294
|
+
|
|
12295
|
+
if (hparams.swin_norm) {
|
|
12296
|
+
cur = build_norm(cur,
|
|
12297
|
+
model.layers[il].ffn_norm, NULL,
|
|
12298
|
+
LLM_NORM_RMS, il);
|
|
12299
|
+
cb(cur, "ffn_norm", il);
|
|
12300
|
+
}
|
|
12301
|
+
|
|
12302
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
12303
|
+
cb(cur, "ffn_out", il);
|
|
12304
|
+
|
|
12305
|
+
cur = build_cvec(cur, il);
|
|
12306
|
+
cb(cur, "l_out", il);
|
|
12307
|
+
|
|
12308
|
+
// input for next layer
|
|
12309
|
+
inpL = cur;
|
|
12310
|
+
}
|
|
12311
|
+
|
|
12312
|
+
cur = inpL;
|
|
12313
|
+
|
|
12314
|
+
cur = build_norm(cur,
|
|
12315
|
+
model.output_norm, NULL,
|
|
12316
|
+
LLM_NORM_RMS, -1);
|
|
12317
|
+
|
|
12318
|
+
cb(cur, "result_norm", -1);
|
|
12319
|
+
res->t_embd = cur;
|
|
12320
|
+
|
|
12321
|
+
// lm_head
|
|
12322
|
+
cur = build_lora_mm(model.output, cur);
|
|
12323
|
+
cb(cur, "result_output_with_img_logits", -1);
|
|
12324
|
+
|
|
12325
|
+
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
|
12326
|
+
// Needs to be removed once image outputs are supported.
|
|
12327
|
+
int img_token_end_idx = 8196;
|
|
12328
|
+
int img_token_start_idx = 4;
|
|
12329
|
+
int num_img_tokens = img_token_end_idx - img_token_start_idx;
|
|
12330
|
+
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
|
|
12331
|
+
// which ensures that text token values are always at least larger than image token values
|
|
12332
|
+
ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
|
|
12333
|
+
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
|
|
12334
|
+
cb(img_logits, "img_logits", -1);
|
|
12335
|
+
|
|
12336
|
+
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
|
|
12337
|
+
|
|
12338
|
+
cb(cur, "result_output", -1);
|
|
12339
|
+
res->t_logits = cur;
|
|
12340
|
+
|
|
12341
|
+
ggml_build_forward_expand(gf, cur);
|
|
12342
|
+
}
|
|
12343
|
+
};
|
|
12344
|
+
|
|
12345
|
+
struct llm_build_wavtokenizer_dec : public llm_graph_context {
|
|
12346
|
+
llm_build_wavtokenizer_dec(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
12347
|
+
ggml_tensor * cur;
|
|
12348
|
+
ggml_tensor * inpL;
|
|
12349
|
+
|
|
12350
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
12351
|
+
|
|
12352
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
|
|
12353
|
+
|
|
12354
|
+
cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
|
|
12355
|
+
cur = ggml_add(ctx0, cur, model.conv1d_b);
|
|
12356
|
+
|
|
12357
|
+
// posnet
|
|
12358
|
+
for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
|
|
12359
|
+
const auto & layer = model.layers[il].posnet;
|
|
12360
|
+
|
|
12361
|
+
inpL = cur;
|
|
12362
|
+
|
|
12363
|
+
switch (il) {
|
|
12364
|
+
case 0:
|
|
12365
|
+
case 1:
|
|
12366
|
+
case 3:
|
|
12367
|
+
case 4:
|
|
12368
|
+
{
|
|
12369
|
+
cur = build_norm(cur,
|
|
12370
|
+
layer.norm1,
|
|
12371
|
+
layer.norm1_b,
|
|
12372
|
+
LLM_NORM_GROUP, 0);
|
|
12373
|
+
|
|
12374
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
12375
|
+
|
|
12376
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
|
|
12377
|
+
cur = ggml_add(ctx0, cur, layer.conv1_b);
|
|
12378
|
+
|
|
12379
|
+
cur = build_norm(cur,
|
|
12380
|
+
layer.norm2,
|
|
12381
|
+
layer.norm2_b,
|
|
12382
|
+
LLM_NORM_GROUP, 0);
|
|
12383
|
+
|
|
12384
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
12385
|
+
|
|
12386
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
|
|
12387
|
+
cur = ggml_add(ctx0, cur, layer.conv2_b);
|
|
12388
|
+
|
|
12389
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
12390
|
+
} break;
|
|
12391
|
+
case 2:
|
|
12392
|
+
{
|
|
12393
|
+
cur = build_norm(cur,
|
|
12394
|
+
layer.attn_norm,
|
|
12395
|
+
layer.attn_norm_b,
|
|
12396
|
+
LLM_NORM_GROUP, 0);
|
|
12397
|
+
|
|
12398
|
+
ggml_tensor * q;
|
|
12399
|
+
ggml_tensor * k;
|
|
12400
|
+
ggml_tensor * v;
|
|
12401
|
+
|
|
12402
|
+
q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
|
|
12403
|
+
k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
|
|
12404
|
+
v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
|
|
12405
|
+
|
|
12406
|
+
q = ggml_add(ctx0, q, layer.attn_q_b);
|
|
12407
|
+
k = ggml_add(ctx0, k, layer.attn_k_b);
|
|
12408
|
+
v = ggml_add(ctx0, v, layer.attn_v_b);
|
|
12409
|
+
|
|
12410
|
+
q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
|
|
12411
|
+
k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
|
|
12412
|
+
|
|
12413
|
+
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
12414
|
+
|
|
12415
|
+
kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
|
|
12416
|
+
|
|
12417
|
+
cur = ggml_mul_mat(ctx0, kq, v);
|
|
12418
|
+
|
|
12419
|
+
cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
|
|
12420
|
+
cur = ggml_add(ctx0, cur, layer.attn_o_b);
|
|
12421
|
+
|
|
12422
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
12423
|
+
} break;
|
|
12424
|
+
case 5:
|
|
12425
|
+
{
|
|
12426
|
+
cur = build_norm(cur,
|
|
12427
|
+
layer.norm,
|
|
12428
|
+
layer.norm_b,
|
|
12429
|
+
LLM_NORM_GROUP, 0);
|
|
12430
|
+
} break;
|
|
12431
|
+
default: GGML_ABORT("unknown posnet layer");
|
|
12432
|
+
};
|
|
12433
|
+
}
|
|
12434
|
+
|
|
12435
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
12436
|
+
|
|
12437
|
+
cur = build_norm(cur,
|
|
12438
|
+
model.tok_norm,
|
|
12439
|
+
model.tok_norm_b,
|
|
12440
|
+
LLM_NORM, -1);
|
|
12441
|
+
|
|
12442
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
12443
|
+
|
|
12444
|
+
inpL = cur;
|
|
12445
|
+
|
|
12446
|
+
// convnext
|
|
12447
|
+
for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
|
|
12448
|
+
const auto & layer = model.layers[il].convnext;
|
|
12449
|
+
|
|
12450
|
+
cur = inpL;
|
|
12451
|
+
|
|
12452
|
+
cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
|
|
12453
|
+
cur = ggml_add(ctx0, cur, layer.dw_b);
|
|
12454
|
+
|
|
12455
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
12456
|
+
|
|
12457
|
+
cur = build_norm(cur,
|
|
12458
|
+
layer.norm,
|
|
12459
|
+
layer.norm_b,
|
|
12460
|
+
LLM_NORM, -1);
|
|
12461
|
+
|
|
12462
|
+
cur = build_ffn(cur,
|
|
12463
|
+
layer.pw1, layer.pw1_b, NULL,
|
|
12464
|
+
NULL, NULL, NULL,
|
|
12465
|
+
layer.pw2, layer.pw2_b, NULL,
|
|
12466
|
+
NULL,
|
|
12467
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
12468
|
+
|
|
12469
|
+
cur = ggml_mul(ctx0, cur, layer.gamma);
|
|
12470
|
+
|
|
12471
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
12472
|
+
|
|
12473
|
+
inpL = ggml_add(ctx0, cur, inpL);
|
|
12474
|
+
}
|
|
12475
|
+
|
|
12476
|
+
cur = inpL;
|
|
12477
|
+
|
|
12478
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
12479
|
+
|
|
12480
|
+
cur = build_norm(cur,
|
|
12481
|
+
model.output_norm,
|
|
12482
|
+
model.output_norm_b,
|
|
12483
|
+
LLM_NORM, -1);
|
|
12484
|
+
|
|
12485
|
+
// lm_head
|
|
12486
|
+
cur = build_lora_mm(model.output, cur);
|
|
12487
|
+
|
|
12488
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
12489
|
+
|
|
12490
|
+
cb(cur, "result_embd", -1);
|
|
12491
|
+
res->t_embd = cur;
|
|
12492
|
+
|
|
12493
|
+
ggml_build_forward_expand(gf, cur);
|
|
12494
|
+
}
|
|
12495
|
+
};
|
|
12496
|
+
|
|
12497
|
+
struct llm_build_plm : public llm_graph_context {
|
|
12498
|
+
llm_build_plm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
12499
|
+
const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
|
|
12500
|
+
|
|
12501
|
+
const uint32_t n_embd_head_qk_rope = hparams.n_rot;
|
|
12502
|
+
const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
|
|
12503
|
+
const uint32_t kv_lora_rank = hparams.n_lora_kv;
|
|
12504
|
+
|
|
12505
|
+
ggml_tensor * cur;
|
|
12506
|
+
ggml_tensor * inpL;
|
|
12507
|
+
|
|
12508
|
+
// {n_embd, n_tokens}
|
|
12509
|
+
inpL = build_inp_embd(model.tok_embd);
|
|
12510
|
+
|
|
12511
|
+
// inp_pos - contains the positions
|
|
12512
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
12513
|
+
|
|
12514
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
12515
|
+
|
|
12516
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
12517
|
+
ggml_tensor * inpSA = inpL;
|
|
12518
|
+
|
|
12519
|
+
// norm
|
|
12520
|
+
cur = build_norm(inpL,
|
|
12521
|
+
model.layers[il].attn_norm, NULL,
|
|
12522
|
+
LLM_NORM_RMS, il);
|
|
12523
|
+
cb(cur, "attn_norm", il);
|
|
12524
|
+
|
|
12525
|
+
// self_attention
|
|
12526
|
+
{
|
|
12527
|
+
ggml_tensor * q = NULL;
|
|
12528
|
+
q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
|
12529
|
+
cb(q, "q", il);
|
|
12530
|
+
|
|
12531
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
12532
|
+
ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
|
|
12533
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
12534
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
12535
|
+
0);
|
|
12536
|
+
cb(q_nope, "q_nope", il);
|
|
12537
|
+
|
|
12538
|
+
// and {n_head * n_embd_head_qk_rope, n_tokens}
|
|
12539
|
+
ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
|
|
12540
|
+
ggml_row_size(q->type, hparams.n_embd_head_k),
|
|
12541
|
+
ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
|
|
12542
|
+
ggml_row_size(q->type, n_embd_head_qk_nope));
|
|
12543
|
+
cb(q_pe, "q_pe", il);
|
|
12544
|
+
|
|
12545
|
+
// {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
|
|
12546
|
+
ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
|
|
12547
|
+
cb(kv_pe_compresseed, "kv_pe_compresseed", il);
|
|
12548
|
+
|
|
12549
|
+
// split into {kv_lora_rank, n_tokens}
|
|
12550
|
+
ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
|
|
12551
|
+
kv_pe_compresseed->nb[1],
|
|
12552
|
+
0);
|
|
12553
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
12554
|
+
|
|
12555
|
+
// and {n_embd_head_qk_rope, n_tokens}
|
|
12556
|
+
ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
|
|
12557
|
+
kv_pe_compresseed->nb[1],
|
|
12558
|
+
kv_pe_compresseed->nb[1],
|
|
12559
|
+
ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
|
|
12560
|
+
cb(k_pe, "k_pe", il);
|
|
12561
|
+
|
|
12562
|
+
kv_compressed = build_norm(kv_compressed,
|
|
12563
|
+
model.layers[il].attn_kv_a_norm, NULL,
|
|
12564
|
+
LLM_NORM_RMS, il);
|
|
12565
|
+
cb(kv_compressed, "kv_compressed", il);
|
|
12566
|
+
|
|
12567
|
+
// {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
|
|
12568
|
+
ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
|
|
12569
|
+
cb(kv, "kv", il);
|
|
12570
|
+
|
|
12571
|
+
// split into {n_head * n_embd_head_qk_nope, n_tokens}
|
|
12572
|
+
ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
|
|
12573
|
+
ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
|
|
12574
|
+
ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
12575
|
+
0);
|
|
12576
|
+
cb(k_nope, "k_nope", il);
|
|
12577
|
+
|
|
12578
|
+
// and {n_head * n_embd_head_v, n_tokens}
|
|
12579
|
+
ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
|
|
12580
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
|
|
12581
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
|
|
12582
|
+
ggml_row_size(kv->type, (n_embd_head_qk_nope)));
|
|
12583
|
+
cb(v_states, "v_states", il);
|
|
12584
|
+
|
|
12585
|
+
v_states = ggml_cont(ctx0, v_states);
|
|
12586
|
+
cb(v_states, "v_states", il);
|
|
12587
|
+
|
|
12588
|
+
v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
|
|
12589
|
+
ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
|
|
12590
|
+
0);
|
|
12591
|
+
cb(v_states, "v_states", il);
|
|
12592
|
+
|
|
12593
|
+
q_pe = ggml_rope_ext(
|
|
12594
|
+
ctx0, q_pe, inp_pos, nullptr,
|
|
12595
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12596
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12597
|
+
);
|
|
12598
|
+
cb(q_pe, "q_pe", il);
|
|
12599
|
+
|
|
12600
|
+
// shared RoPE key
|
|
12601
|
+
k_pe = ggml_rope_ext(
|
|
12602
|
+
ctx0, k_pe, inp_pos, nullptr,
|
|
12603
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12604
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12605
|
+
);
|
|
12606
|
+
cb(k_pe, "k_pe", il);
|
|
12607
|
+
|
|
12608
|
+
ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
|
|
12609
|
+
cb(q_states, "q_states", il);
|
|
12610
|
+
|
|
12611
|
+
ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
|
|
12612
|
+
cb(k_states, "k_states", il);
|
|
12613
|
+
|
|
12614
|
+
cur = build_attn(inp_attn, gf,
|
|
12615
|
+
model.layers[il].wo, NULL,
|
|
12616
|
+
q_states, k_states, v_states, nullptr, nullptr, kq_scale, il);
|
|
12617
|
+
}
|
|
12618
|
+
|
|
12619
|
+
if (il == n_layer - 1) {
|
|
12620
|
+
// skip computing output for unused tokens
|
|
12621
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12622
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12623
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12624
|
+
}
|
|
12625
|
+
|
|
12626
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
12627
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
12628
|
+
|
|
12629
|
+
cur = build_norm(ffn_inp,
|
|
12630
|
+
model.layers[il].ffn_norm, NULL,
|
|
12631
|
+
LLM_NORM_RMS, il);
|
|
12632
|
+
cb(cur, "ffn_norm", il);
|
|
11407
12633
|
|
|
11408
12634
|
cur = build_ffn(cur,
|
|
11409
12635
|
model.layers[il].ffn_up, NULL, NULL,
|
|
11410
|
-
|
|
12636
|
+
NULL, NULL, NULL,
|
|
11411
12637
|
model.layers[il].ffn_down, NULL, NULL,
|
|
11412
12638
|
NULL,
|
|
11413
|
-
|
|
12639
|
+
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
|
|
11414
12640
|
cb(cur, "ffn_out", il);
|
|
11415
12641
|
|
|
11416
|
-
if (hparams.swin_norm) {
|
|
11417
|
-
cur = build_norm(cur,
|
|
11418
|
-
model.layers[il].ffn_norm, NULL,
|
|
11419
|
-
LLM_NORM_RMS, il);
|
|
11420
|
-
cb(cur, "ffn_norm", il);
|
|
11421
|
-
}
|
|
11422
|
-
|
|
11423
12642
|
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
11424
|
-
cb(cur, "ffn_out", il);
|
|
11425
12643
|
|
|
11426
12644
|
cur = build_cvec(cur, il);
|
|
11427
12645
|
cb(cur, "l_out", il);
|
|
@@ -11439,22 +12657,7 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
11439
12657
|
cb(cur, "result_norm", -1);
|
|
11440
12658
|
res->t_embd = cur;
|
|
11441
12659
|
|
|
11442
|
-
// lm_head
|
|
11443
12660
|
cur = build_lora_mm(model.output, cur);
|
|
11444
|
-
cb(cur, "result_output_with_img_logits", -1);
|
|
11445
|
-
|
|
11446
|
-
// TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
|
|
11447
|
-
// Needs to be removed once image outputs are supported.
|
|
11448
|
-
int img_token_end_idx = 8196;
|
|
11449
|
-
int img_token_start_idx = 4;
|
|
11450
|
-
int num_img_tokens = img_token_end_idx - img_token_start_idx;
|
|
11451
|
-
// creates 1d tensor of size num_img_tokens and values -FLT_MAX,
|
|
11452
|
-
// which ensures that text token values are always at least larger than image token values
|
|
11453
|
-
ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
|
|
11454
|
-
img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
|
|
11455
|
-
cb(img_logits, "img_logits", -1);
|
|
11456
|
-
|
|
11457
|
-
cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
|
|
11458
12661
|
|
|
11459
12662
|
cb(cur, "result_output", -1);
|
|
11460
12663
|
res->t_logits = cur;
|
|
@@ -11463,153 +12666,145 @@ struct llm_build_chameleon : public llm_graph_context {
|
|
|
11463
12666
|
}
|
|
11464
12667
|
};
|
|
11465
12668
|
|
|
11466
|
-
struct
|
|
11467
|
-
|
|
12669
|
+
struct llm_build_bailingmoe : public llm_graph_context {
|
|
12670
|
+
llm_build_bailingmoe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
|
|
11468
12671
|
ggml_tensor * cur;
|
|
11469
12672
|
ggml_tensor * inpL;
|
|
11470
12673
|
|
|
11471
12674
|
inpL = build_inp_embd(model.tok_embd);
|
|
11472
12675
|
|
|
11473
|
-
|
|
11474
|
-
|
|
11475
|
-
cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
|
|
11476
|
-
cur = ggml_add(ctx0, cur, model.conv1d_b);
|
|
11477
|
-
|
|
11478
|
-
// posnet
|
|
11479
|
-
for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
|
|
11480
|
-
const auto & layer = model.layers[il].posnet;
|
|
11481
|
-
|
|
11482
|
-
inpL = cur;
|
|
11483
|
-
|
|
11484
|
-
switch (il) {
|
|
11485
|
-
case 0:
|
|
11486
|
-
case 1:
|
|
11487
|
-
case 3:
|
|
11488
|
-
case 4:
|
|
11489
|
-
{
|
|
11490
|
-
cur = build_norm(cur,
|
|
11491
|
-
layer.norm1,
|
|
11492
|
-
layer.norm1_b,
|
|
11493
|
-
LLM_NORM_GROUP, 0);
|
|
11494
|
-
|
|
11495
|
-
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
11496
|
-
|
|
11497
|
-
cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
|
|
11498
|
-
cur = ggml_add(ctx0, cur, layer.conv1_b);
|
|
11499
|
-
|
|
11500
|
-
cur = build_norm(cur,
|
|
11501
|
-
layer.norm2,
|
|
11502
|
-
layer.norm2_b,
|
|
11503
|
-
LLM_NORM_GROUP, 0);
|
|
11504
|
-
|
|
11505
|
-
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
11506
|
-
|
|
11507
|
-
cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
|
|
11508
|
-
cur = ggml_add(ctx0, cur, layer.conv2_b);
|
|
11509
|
-
|
|
11510
|
-
cur = ggml_add(ctx0, cur, inpL);
|
|
11511
|
-
} break;
|
|
11512
|
-
case 2:
|
|
11513
|
-
{
|
|
11514
|
-
cur = build_norm(cur,
|
|
11515
|
-
layer.attn_norm,
|
|
11516
|
-
layer.attn_norm_b,
|
|
11517
|
-
LLM_NORM_GROUP, 0);
|
|
11518
|
-
|
|
11519
|
-
ggml_tensor * q;
|
|
11520
|
-
ggml_tensor * k;
|
|
11521
|
-
ggml_tensor * v;
|
|
12676
|
+
// inp_pos - contains the positions
|
|
12677
|
+
ggml_tensor * inp_pos = build_inp_pos();
|
|
11522
12678
|
|
|
11523
|
-
|
|
11524
|
-
k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
|
|
11525
|
-
v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
|
|
12679
|
+
auto * inp_attn = build_attn_inp_kv_unified();
|
|
11526
12680
|
|
|
11527
|
-
|
|
11528
|
-
|
|
11529
|
-
v = ggml_add(ctx0, v, layer.attn_v_b);
|
|
12681
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
12682
|
+
ggml_tensor * inpSA = inpL;
|
|
11530
12683
|
|
|
11531
|
-
|
|
11532
|
-
|
|
12684
|
+
// norm
|
|
12685
|
+
cur = build_norm(inpL,
|
|
12686
|
+
model.layers[il].attn_norm, NULL,
|
|
12687
|
+
LLM_NORM_RMS, il);
|
|
12688
|
+
cb(cur, "attn_norm", il);
|
|
11533
12689
|
|
|
11534
|
-
|
|
12690
|
+
// self-attention
|
|
12691
|
+
{
|
|
12692
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
12693
|
+
ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
|
|
11535
12694
|
|
|
11536
|
-
|
|
12695
|
+
// compute Q and K and RoPE them
|
|
12696
|
+
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
|
|
12697
|
+
cb(Qcur, "Qcur", il);
|
|
12698
|
+
if (model.layers[il].bq) {
|
|
12699
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
12700
|
+
cb(Qcur, "Qcur", il);
|
|
12701
|
+
}
|
|
11537
12702
|
|
|
11538
|
-
|
|
12703
|
+
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
|
|
12704
|
+
cb(Kcur, "Kcur", il);
|
|
12705
|
+
if (model.layers[il].bk) {
|
|
12706
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
12707
|
+
cb(Kcur, "Kcur", il);
|
|
12708
|
+
}
|
|
11539
12709
|
|
|
11540
|
-
|
|
11541
|
-
|
|
12710
|
+
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
|
|
12711
|
+
cb(Vcur, "Vcur", il);
|
|
12712
|
+
if (model.layers[il].bv) {
|
|
12713
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
12714
|
+
cb(Vcur, "Vcur", il);
|
|
12715
|
+
}
|
|
11542
12716
|
|
|
11543
|
-
|
|
11544
|
-
|
|
11545
|
-
|
|
11546
|
-
{
|
|
11547
|
-
cur = build_norm(cur,
|
|
11548
|
-
layer.norm,
|
|
11549
|
-
layer.norm_b,
|
|
11550
|
-
LLM_NORM_GROUP, 0);
|
|
11551
|
-
} break;
|
|
11552
|
-
default: GGML_ABORT("unknown posnet layer");
|
|
11553
|
-
};
|
|
11554
|
-
}
|
|
12717
|
+
Qcur = ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens);
|
|
12718
|
+
Kcur = ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens);
|
|
12719
|
+
Vcur = ggml_reshape_3d(ctx0, Vcur, n_rot, n_head_kv, n_tokens);
|
|
11555
12720
|
|
|
11556
|
-
|
|
12721
|
+
Qcur = ggml_rope_ext(
|
|
12722
|
+
ctx0, Qcur, inp_pos, rope_factors,
|
|
12723
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12724
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12725
|
+
);
|
|
11557
12726
|
|
|
11558
|
-
|
|
11559
|
-
|
|
11560
|
-
|
|
11561
|
-
|
|
12727
|
+
Kcur = ggml_rope_ext(
|
|
12728
|
+
ctx0, Kcur, inp_pos, rope_factors,
|
|
12729
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
12730
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
12731
|
+
);
|
|
11562
12732
|
|
|
11563
|
-
|
|
12733
|
+
cb(Qcur, "Qcur", il);
|
|
12734
|
+
cb(Kcur, "Kcur", il);
|
|
12735
|
+
cb(Vcur, "Vcur", il);
|
|
11564
12736
|
|
|
11565
|
-
|
|
12737
|
+
cur = build_attn(inp_attn, gf,
|
|
12738
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
12739
|
+
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il);
|
|
12740
|
+
}
|
|
11566
12741
|
|
|
11567
|
-
|
|
11568
|
-
|
|
11569
|
-
|
|
12742
|
+
if (il == n_layer - 1) {
|
|
12743
|
+
// skip computing output for unused tokens
|
|
12744
|
+
ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
12745
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
12746
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
12747
|
+
}
|
|
11570
12748
|
|
|
11571
|
-
|
|
12749
|
+
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
12750
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
11572
12751
|
|
|
11573
|
-
cur =
|
|
11574
|
-
|
|
12752
|
+
cur = build_norm(ffn_inp,
|
|
12753
|
+
model.layers[il].ffn_norm, NULL,
|
|
12754
|
+
LLM_NORM_RMS, il);
|
|
12755
|
+
cb(cur, "ffn_norm", il);
|
|
11575
12756
|
|
|
11576
|
-
|
|
12757
|
+
ggml_tensor * moe_out =
|
|
12758
|
+
build_moe_ffn(cur,
|
|
12759
|
+
model.layers[il].ffn_gate_inp,
|
|
12760
|
+
model.layers[il].ffn_up_exps,
|
|
12761
|
+
model.layers[il].ffn_gate_exps,
|
|
12762
|
+
model.layers[il].ffn_down_exps,
|
|
12763
|
+
nullptr,
|
|
12764
|
+
n_expert, n_expert_used,
|
|
12765
|
+
LLM_FFN_SILU, hparams.expert_weights_norm,
|
|
12766
|
+
false, hparams.expert_weights_scale,
|
|
12767
|
+
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
|
|
12768
|
+
il);
|
|
12769
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
11577
12770
|
|
|
11578
|
-
|
|
11579
|
-
|
|
11580
|
-
|
|
11581
|
-
|
|
12771
|
+
// FFN shared expert
|
|
12772
|
+
{
|
|
12773
|
+
ggml_tensor * ffn_shexp = build_ffn(cur,
|
|
12774
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
12775
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
12776
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
12777
|
+
NULL,
|
|
12778
|
+
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
|
12779
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
11582
12780
|
|
|
11583
|
-
|
|
11584
|
-
|
|
11585
|
-
|
|
11586
|
-
layer.pw2, layer.pw2_b, NULL,
|
|
11587
|
-
NULL,
|
|
11588
|
-
LLM_FFN_GELU, LLM_FFN_SEQ, il);
|
|
12781
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
12782
|
+
cb(cur, "ffn_out", il);
|
|
12783
|
+
}
|
|
11589
12784
|
|
|
11590
|
-
cur =
|
|
12785
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
11591
12786
|
|
|
11592
|
-
cur =
|
|
12787
|
+
cur = build_cvec(cur, il);
|
|
12788
|
+
cb(cur, "l_out", il);
|
|
11593
12789
|
|
|
11594
|
-
|
|
12790
|
+
// input for next layer
|
|
12791
|
+
inpL = cur;
|
|
11595
12792
|
}
|
|
11596
12793
|
|
|
11597
12794
|
cur = inpL;
|
|
11598
12795
|
|
|
11599
|
-
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
11600
|
-
|
|
11601
12796
|
cur = build_norm(cur,
|
|
11602
|
-
model.output_norm,
|
|
11603
|
-
|
|
11604
|
-
|
|
12797
|
+
model.output_norm, NULL,
|
|
12798
|
+
LLM_NORM_RMS, -1);
|
|
12799
|
+
|
|
12800
|
+
cb(cur, "result_norm", -1);
|
|
12801
|
+
res->t_embd = cur;
|
|
11605
12802
|
|
|
11606
12803
|
// lm_head
|
|
11607
12804
|
cur = build_lora_mm(model.output, cur);
|
|
11608
12805
|
|
|
11609
|
-
cur
|
|
11610
|
-
|
|
11611
|
-
cb(cur, "result_embd", -1);
|
|
11612
|
-
res->t_embd = cur;
|
|
12806
|
+
cb(cur, "result_output", -1);
|
|
12807
|
+
res->t_logits = cur;
|
|
11613
12808
|
|
|
11614
12809
|
ggml_build_forward_expand(gf, cur);
|
|
11615
12810
|
}
|
|
@@ -11659,6 +12854,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11659
12854
|
|
|
11660
12855
|
switch (arch) {
|
|
11661
12856
|
case LLM_ARCH_LLAMA:
|
|
12857
|
+
case LLM_ARCH_LLAMA4:
|
|
11662
12858
|
case LLM_ARCH_MINICPM:
|
|
11663
12859
|
case LLM_ARCH_GRANITE:
|
|
11664
12860
|
case LLM_ARCH_GRANITE_MOE:
|
|
@@ -11692,6 +12888,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11692
12888
|
case LLM_ARCH_BERT:
|
|
11693
12889
|
case LLM_ARCH_JINA_BERT_V2:
|
|
11694
12890
|
case LLM_ARCH_NOMIC_BERT:
|
|
12891
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
11695
12892
|
{
|
|
11696
12893
|
llm = std::make_unique<llm_build_bert>(*this, params, gf);
|
|
11697
12894
|
} break;
|
|
@@ -11723,6 +12920,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11723
12920
|
{
|
|
11724
12921
|
llm = std::make_unique<llm_build_qwen2moe>(*this, params, gf);
|
|
11725
12922
|
} break;
|
|
12923
|
+
case LLM_ARCH_QWEN3:
|
|
12924
|
+
{
|
|
12925
|
+
llm = std::make_unique<llm_build_qwen3>(*this, params, gf);
|
|
12926
|
+
} break;
|
|
12927
|
+
case LLM_ARCH_QWEN3MOE:
|
|
12928
|
+
{
|
|
12929
|
+
llm = std::make_unique<llm_build_qwen3moe>(*this, params, gf);
|
|
12930
|
+
} break;
|
|
11726
12931
|
case LLM_ARCH_PHI2:
|
|
11727
12932
|
{
|
|
11728
12933
|
llm = std::make_unique<llm_build_phi2>(*this, params, gf);
|
|
@@ -11828,6 +13033,10 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11828
13033
|
{
|
|
11829
13034
|
llm = std::make_unique<llm_build_chatglm>(*this, params, gf);
|
|
11830
13035
|
} break;
|
|
13036
|
+
case LLM_ARCH_GLM4:
|
|
13037
|
+
{
|
|
13038
|
+
llm = std::make_unique<llm_build_glm4>(*this, params, gf);
|
|
13039
|
+
} break;
|
|
11831
13040
|
case LLM_ARCH_BITNET:
|
|
11832
13041
|
{
|
|
11833
13042
|
llm = std::make_unique<llm_build_bitnet>(*this, params, gf);
|
|
@@ -11846,10 +13055,11 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11846
13055
|
GGML_ABORT("invalid graph type");
|
|
11847
13056
|
};
|
|
11848
13057
|
} break;
|
|
11849
|
-
|
|
11850
|
-
|
|
11851
|
-
|
|
11852
|
-
|
|
13058
|
+
case LLM_ARCH_T5ENCODER:
|
|
13059
|
+
{
|
|
13060
|
+
llm = std::make_unique<llm_build_t5_enc>(*this, params, gf);
|
|
13061
|
+
}
|
|
13062
|
+
break;
|
|
11853
13063
|
case LLM_ARCH_JAIS:
|
|
11854
13064
|
{
|
|
11855
13065
|
llm = std::make_unique<llm_build_jais>(*this, params, gf);
|
|
@@ -11886,6 +13096,14 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11886
13096
|
{
|
|
11887
13097
|
llm = std::make_unique<llm_build_wavtokenizer_dec>(*this, params, gf);
|
|
11888
13098
|
} break;
|
|
13099
|
+
case LLM_ARCH_PLM:
|
|
13100
|
+
{
|
|
13101
|
+
llm = std::make_unique<llm_build_plm>(*this, params, gf);
|
|
13102
|
+
} break;
|
|
13103
|
+
case LLM_ARCH_BAILINGMOE:
|
|
13104
|
+
{
|
|
13105
|
+
llm = std::make_unique<llm_build_bailingmoe>(*this, params, gf);
|
|
13106
|
+
} break;
|
|
11889
13107
|
default:
|
|
11890
13108
|
GGML_ABORT("fatal error");
|
|
11891
13109
|
}
|
|
@@ -11903,6 +13121,7 @@ llm_graph_result_ptr llama_model::build_graph(
|
|
|
11903
13121
|
llama_model_params llama_model_default_params() {
|
|
11904
13122
|
llama_model_params result = {
|
|
11905
13123
|
/*.devices =*/ nullptr,
|
|
13124
|
+
/*.tensor_buft_overrides =*/ nullptr,
|
|
11906
13125
|
/*.n_gpu_layers =*/ 0,
|
|
11907
13126
|
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
|
11908
13127
|
/*.main_gpu =*/ 0,
|
|
@@ -11998,6 +13217,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
11998
13217
|
|
|
11999
13218
|
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
|
12000
13219
|
case LLM_ARCH_LLAMA:
|
|
13220
|
+
case LLM_ARCH_LLAMA4:
|
|
12001
13221
|
case LLM_ARCH_DECI:
|
|
12002
13222
|
case LLM_ARCH_BAICHUAN:
|
|
12003
13223
|
case LLM_ARCH_STARCODER:
|
|
@@ -12012,10 +13232,13 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
12012
13232
|
case LLM_ARCH_ARCTIC:
|
|
12013
13233
|
case LLM_ARCH_DEEPSEEK:
|
|
12014
13234
|
case LLM_ARCH_DEEPSEEK2:
|
|
13235
|
+
case LLM_ARCH_PLM:
|
|
12015
13236
|
case LLM_ARCH_CHATGLM:
|
|
13237
|
+
case LLM_ARCH_GLM4:
|
|
12016
13238
|
case LLM_ARCH_GRANITE:
|
|
12017
13239
|
case LLM_ARCH_GRANITE_MOE:
|
|
12018
13240
|
case LLM_ARCH_CHAMELEON:
|
|
13241
|
+
case LLM_ARCH_BAILINGMOE:
|
|
12019
13242
|
return LLAMA_ROPE_TYPE_NORM;
|
|
12020
13243
|
|
|
12021
13244
|
// the pairs of head values are offset by n_rot/2
|
|
@@ -12024,11 +13247,14 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
|
|
|
12024
13247
|
case LLM_ARCH_DBRX:
|
|
12025
13248
|
case LLM_ARCH_BERT:
|
|
12026
13249
|
case LLM_ARCH_NOMIC_BERT:
|
|
13250
|
+
case LLM_ARCH_NOMIC_BERT_MOE:
|
|
12027
13251
|
case LLM_ARCH_STABLELM:
|
|
12028
13252
|
case LLM_ARCH_BITNET:
|
|
12029
13253
|
case LLM_ARCH_QWEN:
|
|
12030
13254
|
case LLM_ARCH_QWEN2:
|
|
12031
13255
|
case LLM_ARCH_QWEN2MOE:
|
|
13256
|
+
case LLM_ARCH_QWEN3:
|
|
13257
|
+
case LLM_ARCH_QWEN3MOE:
|
|
12032
13258
|
case LLM_ARCH_OLMO2:
|
|
12033
13259
|
case LLM_ARCH_OLMOE:
|
|
12034
13260
|
case LLM_ARCH_PHI2:
|