@fugood/llama.node 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +5 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/EmbeddingWorker.cpp +15 -5
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +1 -1
- package/src/LlamaContext.cpp +81 -18
- package/src/LlamaContext.h +2 -0
- package/src/llama.cpp/.github/workflows/build.yml +197 -159
- package/src/llama.cpp/.github/workflows/docker.yml +5 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +11 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -2
- package/src/llama.cpp/common/arg.cpp +426 -245
- package/src/llama.cpp/common/common.cpp +143 -80
- package/src/llama.cpp/common/common.h +81 -24
- package/src/llama.cpp/common/sampling.cpp +53 -19
- package/src/llama.cpp/common/sampling.h +22 -1
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +101 -148
- package/src/llama.cpp/examples/CMakeLists.txt +32 -13
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +5 -4
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +11 -2
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +1 -1
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +405 -316
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +262 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/llava.cpp +46 -19
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +1 -1
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +1 -1
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +2 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +2 -2
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +9 -5
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +1 -1
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +4 -4
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -4
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -7
- package/src/llama.cpp/examples/server/server.cpp +1758 -886
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +94 -304
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +4 -0
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +3 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +16 -15
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +1 -1
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +46 -34
- package/src/llama.cpp/ggml/include/ggml-backend.h +16 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +7 -49
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml.h +106 -24
- package/src/llama.cpp/ggml/src/CMakeLists.txt +73 -24
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +51 -11
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +379 -22
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -7
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +33 -3
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +95 -35
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +288 -213
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/common.h +19 -22
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.cpp +93 -92
- package/src/llama.cpp/ggml/src/{ggml-amx → ggml-cpu/amx}/mmq.h +2 -9
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +892 -190
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +2 -24
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +15 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +38 -25
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +552 -399
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +101 -136
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +7 -10
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -6
- package/src/llama.cpp/ggml/src/ggml-impl.h +32 -11
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +13 -9
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +131 -64
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +3 -6
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +39 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +14 -7
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +67 -80
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +3 -5
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +5 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +13 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +2 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +2 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +32 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +80 -61
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +159 -114
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +6 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +8 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +4 -1
- package/src/llama.cpp/ggml/src/ggml-threading.h +4 -2
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +21 -7
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1718 -399
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +105 -31
- package/src/llama.cpp/ggml/src/ggml.c +367 -207
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +26 -19
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/src/CMakeLists.txt +2 -7
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +35 -90
- package/src/llama.cpp/src/llama-vocab.cpp +6 -1
- package/src/llama.cpp/src/llama.cpp +1748 -640
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -37
- package/src/llama.cpp/tests/test-arg-parser.cpp +2 -2
- package/src/llama.cpp/tests/test-backend-ops.cpp +140 -21
- package/src/llama.cpp/tests/test-chat-template.cpp +50 -4
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-quantize-fns.cpp +3 -3
- package/src/llama.cpp/tests/test-rope.cpp +61 -20
- package/src/llama.cpp/tests/test-sampling.cpp +2 -2
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/ggml/include/ggml-amx.h +0 -25
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +0 -129
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +0 -107
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +0 -446
package/src/llama.cpp/src/llama.cpp

@@ -163,6 +163,7 @@ enum llm_arch {
     LLM_ARCH_QWEN,
     LLM_ARCH_QWEN2,
     LLM_ARCH_QWEN2MOE,
+    LLM_ARCH_QWEN2VL,
     LLM_ARCH_PHI2,
     LLM_ARCH_PHI3,
     LLM_ARCH_PLAMO,
@@ -179,9 +180,11 @@ enum llm_arch {
     LLM_ARCH_COMMAND_R,
     LLM_ARCH_DBRX,
     LLM_ARCH_OLMO,
+    LLM_ARCH_OLMO2,
     LLM_ARCH_OLMOE,
     LLM_ARCH_OPENELM,
     LLM_ARCH_ARCTIC,
+    LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_BITNET,
@@ -194,60 +197,65 @@ enum llm_arch {
     LLM_ARCH_GRANITE,
     LLM_ARCH_GRANITE_MOE,
     LLM_ARCH_CHAMELEON,
+    LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_UNKNOWN,
 };

 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,
-    { LLM_ARCH_FALCON,
-    { LLM_ARCH_GROK,
-    { LLM_ARCH_GPT2,
-    { LLM_ARCH_GPTJ,
-    { LLM_ARCH_GPTNEOX,
-    { LLM_ARCH_MPT,
-    { LLM_ARCH_BAICHUAN,
-    { LLM_ARCH_STARCODER,
-    { LLM_ARCH_REFACT,
-    { LLM_ARCH_BERT,
-    { LLM_ARCH_NOMIC_BERT,
-    { LLM_ARCH_JINA_BERT_V2,
-    { LLM_ARCH_BLOOM,
-    { LLM_ARCH_STABLELM,
-    { LLM_ARCH_QWEN,
-    { LLM_ARCH_QWEN2,
-    { LLM_ARCH_QWEN2MOE,
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
-    {
+    { LLM_ARCH_LLAMA, "llama" },
+    { LLM_ARCH_FALCON, "falcon" },
+    { LLM_ARCH_GROK, "grok" },
+    { LLM_ARCH_GPT2, "gpt2" },
+    { LLM_ARCH_GPTJ, "gptj" },
+    { LLM_ARCH_GPTNEOX, "gptneox" },
+    { LLM_ARCH_MPT, "mpt" },
+    { LLM_ARCH_BAICHUAN, "baichuan" },
+    { LLM_ARCH_STARCODER, "starcoder" },
+    { LLM_ARCH_REFACT, "refact" },
+    { LLM_ARCH_BERT, "bert" },
+    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
+    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
+    { LLM_ARCH_BLOOM, "bloom" },
+    { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN, "qwen" },
+    { LLM_ARCH_QWEN2, "qwen2" },
+    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
+    { LLM_ARCH_QWEN2VL, "qwen2vl" },
+    { LLM_ARCH_PHI2, "phi2" },
+    { LLM_ARCH_PHI3, "phi3" },
+    { LLM_ARCH_PLAMO, "plamo" },
+    { LLM_ARCH_CODESHELL, "codeshell" },
+    { LLM_ARCH_ORION, "orion" },
+    { LLM_ARCH_INTERNLM2, "internlm2" },
+    { LLM_ARCH_MINICPM, "minicpm" },
+    { LLM_ARCH_MINICPM3, "minicpm3" },
+    { LLM_ARCH_GEMMA, "gemma" },
+    { LLM_ARCH_GEMMA2, "gemma2" },
+    { LLM_ARCH_STARCODER2, "starcoder2" },
+    { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_XVERSE, "xverse" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
+    { LLM_ARCH_DBRX, "dbrx" },
+    { LLM_ARCH_OLMO, "olmo" },
+    { LLM_ARCH_OLMO2, "olmo2" },
+    { LLM_ARCH_OLMOE, "olmoe" },
+    { LLM_ARCH_OPENELM, "openelm" },
+    { LLM_ARCH_ARCTIC, "arctic" },
+    { LLM_ARCH_DEEPSEEK, "deepseek" },
+    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
+    { LLM_ARCH_CHATGLM, "chatglm" },
+    { LLM_ARCH_BITNET, "bitnet" },
+    { LLM_ARCH_T5, "t5" },
+    { LLM_ARCH_T5ENCODER, "t5encoder" },
+    { LLM_ARCH_JAIS, "jais" },
+    { LLM_ARCH_NEMOTRON, "nemotron" },
+    { LLM_ARCH_EXAONE, "exaone" },
+    { LLM_ARCH_RWKV6, "rwkv6" },
+    { LLM_ARCH_GRANITE, "granite" },
+    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
+    { LLM_ARCH_CHAMELEON, "chameleon" },
+    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
+    { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

 enum llm_kv {
@@ -267,6 +275,7 @@ enum llm_kv {
     LLM_KV_VOCAB_SIZE,
     LLM_KV_CONTEXT_LENGTH,
     LLM_KV_EMBEDDING_LENGTH,
+    LLM_KV_FEATURES_LENGTH,
     LLM_KV_BLOCK_COUNT,
     LLM_KV_LEADING_DENSE_BLOCK_COUNT,
     LLM_KV_FEED_FORWARD_LENGTH,
@@ -298,6 +307,8 @@ enum llm_kv {
     LLM_KV_ATTENTION_VALUE_LENGTH,
     LLM_KV_ATTENTION_LAYERNORM_EPS,
     LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_EPS,
+    LLM_KV_ATTENTION_GROUPNORM_GROUPS,
     LLM_KV_ATTENTION_CAUSAL,
     LLM_KV_ATTENTION_Q_LORA_RANK,
     LLM_KV_ATTENTION_KV_LORA_RANK,
@@ -306,6 +317,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,

     LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_SECTIONS,
     LLM_KV_ROPE_FREQ_BASE,
     LLM_KV_ROPE_SCALE_LINEAR,
     LLM_KV_ROPE_SCALING_TYPE,
@@ -360,6 +372,12 @@ enum llm_kv {
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,

+    LLM_KV_POSNET_EMBEDDING_LENGTH,
+    LLM_KV_POSNET_BLOCK_COUNT,
+
+    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
+    LLM_KV_CONVNEXT_BLOCK_COUNT,
+
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -383,6 +401,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
     { LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
     { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
+    { LLM_KV_FEATURES_LENGTH, "%s.features_length" },
     { LLM_KV_BLOCK_COUNT, "%s.block_count" },
     { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
     { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
@@ -414,6 +433,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" },
     { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
     { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" },
+    { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" },
     { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
     { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
     { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },
@@ -422,6 +443,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
+    { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
     { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
     { LLM_KV_ROPE_SCALE_LINEAR, "%s.rope.scale_linear" },
     { LLM_KV_ROPE_SCALING_TYPE, "%s.rope.scaling.type" },
@@ -443,6 +465,12 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

     { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" },

+    { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" },
+    { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" },
+
+    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
+    { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
+
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -601,6 +629,22 @@ enum llm_tensor {
     LLM_TENSOR_ENC_OUTPUT_NORM,
     LLM_TENSOR_CLS,
     LLM_TENSOR_CLS_OUT,
+    LLM_TENSOR_CONV1D,
+    LLM_TENSOR_CONVNEXT_DW,
+    LLM_TENSOR_CONVNEXT_NORM,
+    LLM_TENSOR_CONVNEXT_PW1,
+    LLM_TENSOR_CONVNEXT_PW2,
+    LLM_TENSOR_CONVNEXT_GAMMA,
+    LLM_TENSOR_POS_NET_CONV1,
+    LLM_TENSOR_POS_NET_CONV2,
+    LLM_TENSOR_POS_NET_NORM,
+    LLM_TENSOR_POS_NET_NORM1,
+    LLM_TENSOR_POS_NET_NORM2,
+    LLM_TENSOR_POS_NET_ATTN_NORM,
+    LLM_TENSOR_POS_NET_ATTN_Q,
+    LLM_TENSOR_POS_NET_ATTN_K,
+    LLM_TENSOR_POS_NET_ATTN_V,
+    LLM_TENSOR_POS_NET_ATTN_OUT,
 };

 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
@@ -896,6 +940,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN2VL,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_QWEN2MOE,
         {
@@ -1034,6 +1095,8 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
             { LLM_TENSOR_OUTPUT, "output" },
             { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
+            { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
             { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
@@ -1207,6 +1270,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_OLMO2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_OLMOE,
         {
@@ -1265,6 +1347,33 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_DEEPSEEK,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
+        },
+    },
     {
         LLM_ARCH_DEEPSEEK2,
         {
@@ -1520,6 +1629,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
         },
     },
+    {
+        LLM_ARCH_WAVTOKENIZER_DEC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_CONV1D, "conv1d" },
+            { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
+            { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
+            { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
+            { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
+            { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
+            { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
+            { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
+            { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
+            { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
+            { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
+            { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
+            { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
+            { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
+            { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1528,6 +1662,69 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
         },
     },
 };

+enum llm_chat_template {
+    LLM_CHAT_TEMPLATE_CHATML,
+    LLM_CHAT_TEMPLATE_LLAMA_2,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS,
+    LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP,
+    LLM_CHAT_TEMPLATE_MISTRAL_V1,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3,
+    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_PHI_3,
+    LLM_CHAT_TEMPLATE_ZEPHYR,
+    LLM_CHAT_TEMPLATE_MONARCH,
+    LLM_CHAT_TEMPLATE_GEMMA,
+    LLM_CHAT_TEMPLATE_ORION,
+    LLM_CHAT_TEMPLATE_OPENCHAT,
+    LLM_CHAT_TEMPLATE_VICUNA,
+    LLM_CHAT_TEMPLATE_VICUNA_ORCA,
+    LLM_CHAT_TEMPLATE_DEEPSEEK,
+    LLM_CHAT_TEMPLATE_DEEPSEEK_2,
+    LLM_CHAT_TEMPLATE_COMMAND_R,
+    LLM_CHAT_TEMPLATE_LLAMA_3,
+    LLM_CHAT_TEMPLATE_CHATGML_3,
+    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_MINICPM,
+    LLM_CHAT_TEMPLATE_EXAONE_3,
+    LLM_CHAT_TEMPLATE_RWKV_WORLD,
+    LLM_CHAT_TEMPLATE_GRANITE,
+    LLM_CHAT_TEMPLATE_GIGACHAT,
+    LLM_CHAT_TEMPLATE_UNKNOWN,
+};
+
+static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
+    { "chatml", LLM_CHAT_TEMPLATE_CHATML },
+    { "llama2", LLM_CHAT_TEMPLATE_LLAMA_2 },
+    { "llama2-sys", LLM_CHAT_TEMPLATE_LLAMA_2_SYS },
+    { "llama2-sys-bos", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS },
+    { "llama2-sys-strip", LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP },
+    { "mistral-v1", LLM_CHAT_TEMPLATE_MISTRAL_V1 },
+    { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
+    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
+    { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+    { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
+    { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR },
+    { "monarch", LLM_CHAT_TEMPLATE_MONARCH },
+    { "gemma", LLM_CHAT_TEMPLATE_GEMMA },
+    { "orion", LLM_CHAT_TEMPLATE_ORION },
+    { "openchat", LLM_CHAT_TEMPLATE_OPENCHAT },
+    { "vicuna", LLM_CHAT_TEMPLATE_VICUNA },
+    { "vicuna-orca", LLM_CHAT_TEMPLATE_VICUNA_ORCA },
+    { "deepseek", LLM_CHAT_TEMPLATE_DEEPSEEK },
+    { "deepseek2", LLM_CHAT_TEMPLATE_DEEPSEEK_2 },
+    { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R },
+    { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 },
+    { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 },
+    { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 },
+    { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
+    { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
+    { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
+    { "granite", LLM_CHAT_TEMPLATE_GRANITE },
+    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
+};
+
 static llm_arch llm_arch_from_string(const std::string & name) {
     for (const auto & kv : LLM_ARCH_NAMES) { // NOLINT
         if (kv.second == name) {
@@ -1601,9 +1798,10 @@ struct LLM_TN {
 //

 static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    { LLAMA_ROPE_SCALING_TYPE_NONE,
-    { LLAMA_ROPE_SCALING_TYPE_LINEAR,
-    { LLAMA_ROPE_SCALING_TYPE_YARN,
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
+    { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };

 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
@@ -1709,7 +1907,7 @@ private:
         DWORD bufLen = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                       NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&lpMsgBuf, 0, NULL);
         if (!bufLen) {
-            ret = format("Win32 error code: %
+            ret = format("Win32 error code: %lx", error_code);
         } else {
             ret = lpMsgBuf;
             LocalFree(lpMsgBuf);
@@ -2047,7 +2245,7 @@ struct llama_mmap {
             HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");

             // may fail on pre-Windows 8 systems
-            pPrefetchVirtualMemory =
+            pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *) GetProcAddress(hKernel32, "PrefetchVirtualMemory");

             if (pPrefetchVirtualMemory) {
                 // advise the kernel to preload the mapped memory
@@ -2320,6 +2518,7 @@ enum e_model {
     MODEL_16B,
     MODEL_20B,
     MODEL_30B,
+    MODEL_32B,
     MODEL_34B,
     MODEL_35B,
     MODEL_40B,
@@ -2345,15 +2544,26 @@ static const size_t kiB = 1024;
 static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;

+struct llama_hparams_posnet {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
+struct llama_hparams_convnext {
+    uint32_t n_embd;
+    uint32_t n_layer;
+};
+
 struct llama_hparams {
     bool vocab_only;
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;

-    uint32_t n_vocab;
+    uint32_t n_vocab = 0;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
     uint32_t n_swa = 0; // sliding window attention (SWA)
@@ -2364,6 +2574,10 @@ struct llama_hparams {
     uint32_t n_vocab_type = 0; // for BERT-style token types
     uint32_t n_rel_attn_bkts = 0;

+    // for WavTokenizer
+    struct llama_hparams_posnet posnet;
+    struct llama_hparams_convnext convnext;
+
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
@@ -2378,6 +2592,9 @@ struct llama_hparams {

     float f_norm_eps;
     float f_norm_rms_eps;
+    float f_norm_group_eps;
+
+    uint32_t n_norm_groups;

     float f_attn_logit_softcapping = 50.0f;
     float f_final_logit_softcapping = 30.0f;
@@ -2388,11 +2605,12 @@ struct llama_hparams {
     uint32_t time_decay_extra_dim = 0;
     uint32_t wkv_head_size = 0;

-    float
-    float
-    float
-    uint32_t
-    float
+    float rope_attn_factor = 1.0f;
+    float rope_freq_base_train;
+    float rope_freq_scale_train;
+    uint32_t n_ctx_orig_yarn;
+    float rope_yarn_log_mul;
+    int rope_sections[4];

     // for State Space Models
     uint32_t ssm_d_conv = 0;
@@ -2422,63 +2640,6 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

-    bool operator!=(const llama_hparams & other) const {
-        if (this->vocab_only != other.vocab_only) return true;
-        if (this->n_vocab != other.n_vocab) return true;
-        if (this->n_ctx_train != other.n_ctx_train) return true;
-        if (this->n_embd != other.n_embd) return true;
-        if (this->n_layer != other.n_layer) return true;
-        if (this->n_rot != other.n_rot) return true;
-        if (this->n_swa != other.n_swa) return true;
-        if (this->n_embd_head_k != other.n_embd_head_k) return true;
-        if (this->n_embd_head_v != other.n_embd_head_v) return true;
-        if (this->n_expert != other.n_expert) return true;
-        if (this->n_expert_used != other.n_expert_used) return true;
-
-        if (this->n_head_arr != other.n_head_arr) return true;
-        if (this->n_head_kv_arr != other.n_head_kv_arr) return true;
-        if (this->n_ff_arr != other.n_ff_arr) return true;
-
-        if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true;
-        if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true;
-        if (this->n_lora_q != other.n_lora_q) return true;
-        if (this->n_lora_kv != other.n_lora_kv) return true;
-        if (this->n_ff_exp != other.n_ff_exp) return true;
-        if (this->n_ff_shexp != other.n_ff_shexp) return true;
-        if (this->n_expert_shared != other.n_expert_shared) return true;
-
-        if (this->rope_finetuned != other.rope_finetuned) return true;
-        if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true;
-
-        if (this->ssm_d_conv != other.ssm_d_conv) return true;
-        if (this->ssm_d_inner != other.ssm_d_inner) return true;
-        if (this->ssm_d_state != other.ssm_d_state) return true;
-        if (this->ssm_dt_rank != other.ssm_dt_rank) return true;
-        if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true;
-
-        if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true;
-        if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true;
-        if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true;
-        if (this->wkv_head_size != other.wkv_head_size) return true;
-
-        if (this->dec_start_token_id != other.dec_start_token_id) return true;
-
-        const float EPSILON = 1e-9f;
-
-        if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
-        if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
-        if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
-        if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
-        if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true;
-        if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true;
-        if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true;
-        if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true;
-
-        return false;
-    }
-
     uint32_t n_head(uint32_t il = 0) const {
         if (il < n_layer) {
             return n_head_arr[il];
@@ -2531,21 +2692,21 @@ struct llama_hparams {
         if (wkv_head_size != 0) {
             // for RWKV models
             return 2 * n_embd;
-        } else {
-            // TODO: maybe support other convolution strides than 1
-            // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
-            return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
         }
+
+        // TODO: maybe support other convolution strides than 1
+        // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+        return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
     }

     uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings
         if (wkv_head_size != 0) {
             // corresponds to RWKV's wkv_states size
             return n_embd * wkv_head_size;
-        } else {
-            // corresponds to Mamba's ssm_states size
-            return ssm_d_state * ssm_d_inner;
         }
+
+        // corresponds to Mamba's ssm_states size
+        return ssm_d_state * ssm_d_inner;
     }
 };

@@ -2583,142 +2744,187 @@ struct llama_cparams {
     void * cb_eval_user_data;
 };

-
-
-
-
-
-
+struct llama_layer_posnet {
+    // resnet
+    struct ggml_tensor * norm1 = nullptr;
+    struct ggml_tensor * norm1_b = nullptr;
+
+    struct ggml_tensor * conv1 = nullptr;
+    struct ggml_tensor * conv1_b = nullptr;
+
+    struct ggml_tensor * norm2 = nullptr;
+    struct ggml_tensor * norm2_b = nullptr;
+
+    struct ggml_tensor * conv2 = nullptr;
+    struct ggml_tensor * conv2_b = nullptr;

+    // attention
+    struct ggml_tensor * attn_norm = nullptr;
+    struct ggml_tensor * attn_norm_b = nullptr;
+
+    struct ggml_tensor * attn_q = nullptr;
+    struct ggml_tensor * attn_q_b = nullptr;
+
+    struct ggml_tensor * attn_k = nullptr;
+    struct ggml_tensor * attn_k_b = nullptr;
+
+    struct ggml_tensor * attn_v = nullptr;
+    struct ggml_tensor * attn_v_b = nullptr;
+
+    struct ggml_tensor * attn_o = nullptr;
+    struct ggml_tensor * attn_o_b = nullptr;
+
+    // normalize
+    struct ggml_tensor * norm = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+};
+
+struct llama_layer_convnext {
+    struct ggml_tensor * dw = nullptr;
+    struct ggml_tensor * dw_b = nullptr;
+
+    struct ggml_tensor * norm = nullptr;
+    struct ggml_tensor * norm_b = nullptr;
+
+    struct ggml_tensor * pw1 = nullptr;
+    struct ggml_tensor * pw1_b = nullptr;
+
+    struct ggml_tensor * pw2 = nullptr;
+    struct ggml_tensor * pw2_b = nullptr;
+
+    struct ggml_tensor * gamma = nullptr;
+};
+
+struct llama_layer {
     // normalization
-    struct ggml_tensor * attn_norm;
-    struct ggml_tensor * attn_norm_b;
-    struct ggml_tensor * attn_norm_2;
-    struct ggml_tensor * attn_norm_2_b;
-    struct ggml_tensor * attn_q_norm;
-    struct ggml_tensor * attn_q_norm_b;
-    struct ggml_tensor * attn_k_norm;
-    struct ggml_tensor * attn_k_norm_b;
-    struct ggml_tensor * attn_out_norm;
-    struct ggml_tensor * attn_out_norm_b;
-    struct ggml_tensor * attn_q_a_norm;
-    struct ggml_tensor * attn_kv_a_norm;
-    struct ggml_tensor * attn_sub_norm;
-    struct ggml_tensor * attn_post_norm;
-    struct ggml_tensor * ffn_sub_norm;
-    struct ggml_tensor * attn_norm_cross;
-    struct ggml_tensor * attn_norm_enc;
+    struct ggml_tensor * attn_norm = nullptr;
+    struct ggml_tensor * attn_norm_b = nullptr;
+    struct ggml_tensor * attn_norm_2 = nullptr;
+    struct ggml_tensor * attn_norm_2_b = nullptr;
+    struct ggml_tensor * attn_q_norm = nullptr;
+    struct ggml_tensor * attn_q_norm_b = nullptr;
+    struct ggml_tensor * attn_k_norm = nullptr;
+    struct ggml_tensor * attn_k_norm_b = nullptr;
+    struct ggml_tensor * attn_out_norm = nullptr;
+    struct ggml_tensor * attn_out_norm_b = nullptr;
+    struct ggml_tensor * attn_q_a_norm = nullptr;
+    struct ggml_tensor * attn_kv_a_norm = nullptr;
+    struct ggml_tensor * attn_sub_norm = nullptr;
+    struct ggml_tensor * attn_post_norm = nullptr;
+    struct ggml_tensor * ffn_sub_norm = nullptr;
+    struct ggml_tensor * attn_norm_cross = nullptr;
+    struct ggml_tensor * attn_norm_enc = nullptr;

     // attention
-    struct ggml_tensor * wq;
-    struct ggml_tensor * wk;
-    struct ggml_tensor * wv;
-    struct ggml_tensor * wo;
-    struct ggml_tensor * wqkv;
-    struct ggml_tensor * wq_a;
-    struct ggml_tensor * wq_b;
-    struct ggml_tensor * wkv_a_mqa;
-    struct ggml_tensor * wkv_b;
-    struct ggml_tensor * wq_cross;
-    struct ggml_tensor * wk_cross;
-    struct ggml_tensor * wv_cross;
-    struct ggml_tensor * wo_cross;
-    struct ggml_tensor * wq_enc;
-    struct ggml_tensor * wk_enc;
-    struct ggml_tensor * wv_enc;
-    struct ggml_tensor * wo_enc;
+    struct ggml_tensor * wq = nullptr;
+    struct ggml_tensor * wk = nullptr;
+    struct ggml_tensor * wv = nullptr;
+    struct ggml_tensor * wo = nullptr;
+    struct ggml_tensor * wqkv = nullptr;
+    struct ggml_tensor * wq_a = nullptr;
+    struct ggml_tensor * wq_b = nullptr;
+    struct ggml_tensor * wkv_a_mqa = nullptr;
+    struct ggml_tensor * wkv_b = nullptr;
+    struct ggml_tensor * wq_cross = nullptr;
+    struct ggml_tensor * wk_cross = nullptr;
+    struct ggml_tensor * wv_cross = nullptr;
+    struct ggml_tensor * wo_cross = nullptr;
+    struct ggml_tensor * wq_enc = nullptr;
+    struct ggml_tensor * wk_enc = nullptr;
+    struct ggml_tensor * wv_enc = nullptr;
+    struct ggml_tensor * wo_enc = nullptr;

     // attention bias
-    struct ggml_tensor * bq;
-    struct ggml_tensor * bk;
-    struct ggml_tensor * bv;
-    struct ggml_tensor * bo;
-    struct ggml_tensor * bqkv;
+    struct ggml_tensor * bq = nullptr;
+    struct ggml_tensor * bk = nullptr;
+    struct ggml_tensor * bv = nullptr;
+    struct ggml_tensor * bo = nullptr;
+    struct ggml_tensor * bqkv = nullptr;

     // relative position bias
-    struct ggml_tensor * attn_rel_b;
-    struct ggml_tensor * attn_rel_b_enc;
-    struct ggml_tensor * attn_rel_b_cross;
+    struct ggml_tensor * attn_rel_b = nullptr;
+    struct ggml_tensor * attn_rel_b_enc = nullptr;
+    struct ggml_tensor * attn_rel_b_cross = nullptr;

     // normalization
-    struct ggml_tensor * ffn_norm;
-    struct ggml_tensor * ffn_norm_b;
-    struct ggml_tensor * ffn_post_norm;
-    struct ggml_tensor * layer_out_norm;
-    struct ggml_tensor * layer_out_norm_b;
-    struct ggml_tensor * ffn_norm_exps;
-    struct ggml_tensor * ffn_norm_enc;
+    struct ggml_tensor * ffn_norm = nullptr;
+    struct ggml_tensor * ffn_norm_b = nullptr;
+    struct ggml_tensor * ffn_post_norm = nullptr;
+    struct ggml_tensor * layer_out_norm = nullptr;
+    struct ggml_tensor * layer_out_norm_b = nullptr;
+    struct ggml_tensor * ffn_norm_exps = nullptr;
+    struct ggml_tensor * ffn_norm_enc = nullptr;

     // ff
-    struct ggml_tensor * ffn_gate; // w1
-    struct ggml_tensor * ffn_down; // w2
-    struct ggml_tensor * ffn_up;
-    struct ggml_tensor * ffn_gate_enc;
-    struct ggml_tensor * ffn_down_enc;
-    struct ggml_tensor * ffn_up_enc;
+    struct ggml_tensor * ffn_gate = nullptr; // w1
+    struct ggml_tensor * ffn_down = nullptr; // w2
+    struct ggml_tensor * ffn_up = nullptr; // w3
+    struct ggml_tensor * ffn_gate_enc = nullptr;
+    struct ggml_tensor * ffn_down_enc = nullptr;
+    struct ggml_tensor * ffn_up_enc = nullptr;

     // ff MoE
-    struct ggml_tensor * ffn_gate_inp;
-    struct ggml_tensor * ffn_gate_exps;
-    struct ggml_tensor * ffn_down_exps;
-    struct ggml_tensor * ffn_up_exps ;
+    struct ggml_tensor * ffn_gate_inp = nullptr;
+    struct ggml_tensor * ffn_gate_exps = nullptr;
+    struct ggml_tensor * ffn_down_exps = nullptr;
+    struct ggml_tensor * ffn_up_exps = nullptr;

     // ff shared expert (shexp)
-    struct ggml_tensor * ffn_gate_inp_shexp;
-    struct ggml_tensor * ffn_gate_shexp;
-    struct ggml_tensor * ffn_down_shexp;
-    struct ggml_tensor * ffn_up_shexp;
+    struct ggml_tensor * ffn_gate_inp_shexp = nullptr;
+    struct ggml_tensor * ffn_gate_shexp = nullptr;
+    struct ggml_tensor * ffn_down_shexp = nullptr;
+    struct ggml_tensor * ffn_up_shexp = nullptr;

     // ff bias
-    struct ggml_tensor * ffn_gate_b;
-    struct ggml_tensor * ffn_down_b; // b2
-    struct ggml_tensor * ffn_up_b; // b3
-    struct ggml_tensor * ffn_act;
+    struct ggml_tensor * ffn_gate_b = nullptr;
+    struct ggml_tensor * ffn_down_b = nullptr; // b2
+    struct ggml_tensor * ffn_up_b = nullptr; // b3
+    struct ggml_tensor * ffn_act = nullptr;

     // mamba proj
-    struct ggml_tensor * ssm_in;
-    struct ggml_tensor * ssm_x;
-    struct ggml_tensor * ssm_dt;
-    struct ggml_tensor * ssm_out;
+    struct ggml_tensor * ssm_in = nullptr;
+    struct ggml_tensor * ssm_x = nullptr;
+    struct ggml_tensor * ssm_dt = nullptr;
+    struct ggml_tensor * ssm_out = nullptr;

     // mamba
-    struct ggml_tensor * ssm_conv1d;
-    struct ggml_tensor * ssm_a;
-    struct ggml_tensor * ssm_d;
+    struct ggml_tensor * ssm_conv1d = nullptr;
+    struct ggml_tensor * ssm_a = nullptr;
+    struct ggml_tensor * ssm_d = nullptr;

     // mamba bias
-    struct ggml_tensor * ssm_conv1d_b;
-    struct ggml_tensor * ssm_dt_b;
+    struct ggml_tensor * ssm_conv1d_b = nullptr;
+    struct ggml_tensor * ssm_dt_b = nullptr;

     // rwkv
-    struct ggml_tensor * time_mix_w1;
-    struct ggml_tensor * time_mix_w2;
-    struct ggml_tensor * time_mix_lerp_x;
-    struct ggml_tensor * time_mix_lerp_w;
-    struct ggml_tensor * time_mix_lerp_k;
-    struct ggml_tensor * time_mix_lerp_v;
-    struct ggml_tensor * time_mix_lerp_r;
-    struct ggml_tensor * time_mix_lerp_g;
-
-    struct ggml_tensor * time_mix_first;
-    struct ggml_tensor * time_mix_decay;
-    struct ggml_tensor * time_mix_decay_w1;
-    struct ggml_tensor * time_mix_decay_w2;
-    struct ggml_tensor * time_mix_key;
-    struct ggml_tensor * time_mix_value;
-    struct ggml_tensor * time_mix_receptance;
-    struct ggml_tensor * time_mix_gate;
-
-    struct ggml_tensor * time_mix_ln;
-    struct ggml_tensor * time_mix_ln_b;
-    struct ggml_tensor * time_mix_output;
-
-    struct ggml_tensor * channel_mix_lerp_k;
-    struct ggml_tensor * channel_mix_lerp_r;
-
-    struct ggml_tensor * channel_mix_key;
-    struct ggml_tensor * channel_mix_receptance;
-    struct ggml_tensor * channel_mix_value;
+    struct ggml_tensor * time_mix_w1 = nullptr;
+    struct ggml_tensor * time_mix_w2 = nullptr;
+    struct ggml_tensor * time_mix_lerp_x = nullptr;
+    struct ggml_tensor * time_mix_lerp_w = nullptr;
+    struct ggml_tensor * time_mix_lerp_k = nullptr;
+    struct ggml_tensor * time_mix_lerp_v = nullptr;
+    struct ggml_tensor * time_mix_lerp_r = nullptr;
+    struct ggml_tensor * time_mix_lerp_g = nullptr;
+
+    struct ggml_tensor * time_mix_first = nullptr;
+    struct ggml_tensor * time_mix_decay = nullptr;
+    struct ggml_tensor * time_mix_decay_w1 = nullptr;
+    struct ggml_tensor * time_mix_decay_w2 = nullptr;
+    struct ggml_tensor * time_mix_key = nullptr;
+    struct ggml_tensor * time_mix_value = nullptr;
+    struct ggml_tensor * time_mix_receptance = nullptr;
+    struct ggml_tensor * time_mix_gate = nullptr;
+
+    struct ggml_tensor * time_mix_ln = nullptr;
+    struct ggml_tensor * time_mix_ln_b = nullptr;
+    struct ggml_tensor * time_mix_output = nullptr;
+
+    struct ggml_tensor * channel_mix_lerp_k = nullptr;
+    struct ggml_tensor * channel_mix_lerp_r = nullptr;
+
+    struct ggml_tensor * channel_mix_key = nullptr;
+    struct ggml_tensor * channel_mix_receptance = nullptr;
+    struct ggml_tensor * channel_mix_value = nullptr;

     // long rope factors
     struct ggml_tensor * rope_long = nullptr;
@@ -2726,13 +2932,17 @@ struct llama_layer {
     struct ggml_tensor * rope_freqs = nullptr;

     // bitnet scale
-    struct ggml_tensor * wq_scale;
-    struct ggml_tensor * wk_scale;
-    struct ggml_tensor * wv_scale;
-    struct ggml_tensor * wo_scale;
-    struct ggml_tensor * ffn_gate_scale;
-    struct ggml_tensor * ffn_up_scale;
-    struct ggml_tensor * ffn_down_scale;
+    struct ggml_tensor * wq_scale = nullptr;
+    struct ggml_tensor * wk_scale = nullptr;
+    struct ggml_tensor * wv_scale = nullptr;
+    struct ggml_tensor * wo_scale = nullptr;
+    struct ggml_tensor * ffn_gate_scale = nullptr;
+    struct ggml_tensor * ffn_up_scale = nullptr;
+    struct ggml_tensor * ffn_down_scale = nullptr;
+
+    struct llama_layer_posnet posnet;
+
+    struct llama_layer_convnext convnext;
 };

 // very similar to llama_batch,
|
@@ -2863,6 +3073,9 @@ struct llama_model {
     struct ggml_tensor * cls_out = nullptr;
     struct ggml_tensor * cls_out_b = nullptr;
 
+    struct ggml_tensor * conv1d = nullptr;
+    struct ggml_tensor * conv1d_b = nullptr;
+
     std::vector<llama_layer> layers;
 
     // gguf metadata
@@ -2947,6 +3160,7 @@ struct llama_sbatch {
     // batch indices of the output
    std::vector<size_t> out_ids;
    std::vector<llama_sbatch_seq> seq;
+
    const llama_batch * batch = nullptr;

    // buffers for the ubatch
@@ -3292,6 +3506,11 @@ struct llama_context {
     // whether we are computing encoder output or decoder output
     bool is_encoding = false;
 
+    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    // number of position id each token get, 1 for each token in most cases.
+    // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
+    int n_pos_per_token = 1;
+
     // output of the encoder part of the encoder-decoder models
     std::vector<float> embd_enc;
     std::vector<std::set<llama_seq_id>> seq_ids_enc;
@@ -3362,6 +3581,17 @@ static int llama_get_device_count(const llama_model & model) {
     return (int) model.devices.size();
 }
 
+static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) {
+    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == model->tensors_by_name.end()) {
+        return nullptr;
+    }
+    return it->second;
+}
+
 template<typename F>
 static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) {
     ggml_init_params params = {
@@ -3415,7 +3645,9 @@ static bool llama_kv_cache_init(
 
     const struct llama_hparams & hparams = model.hparams;
 
-    const
+    const int32_t n_layer = hparams.n_layer;
+
+    LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer);
 
     cache.has_shift = false;
 
@@ -3456,10 +3688,12 @@ static bool llama_kv_cache_init(
     cache.k_l.reserve(n_layer);
     cache.v_l.reserve(n_layer);
 
-    for (int i = 0; i <
+    for (int i = 0; i < n_layer; i++) {
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
 
+        LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa);
+
         ggml_backend_buffer_type_t buft;
         if (offload) {
             auto * dev = model.dev_layer.at(i).dev;
@@ -4492,9 +4726,6 @@ struct llama_model_loader {
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
             case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
             case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
-            case GGML_TYPE_Q4_0_4_4: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_4; break;
-            case GGML_TYPE_Q4_0_4_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_4_8; break;
-            case GGML_TYPE_Q4_0_8_8: ftype = LLAMA_FTYPE_MOSTLY_Q4_0_8_8; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -4845,7 +5076,9 @@ struct llama_model_loader {
         mappings.reserve(files.size());
         mmaps_used.reserve(files.size());
         for (const auto & file : files) {
-
+            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+            std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
             mmaps_used.emplace_back(mapping->size, 0);
             if (mlock_mmaps) {
                 std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
@@ -5256,9 +5489,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: return "Q4_0_4_4";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: return "Q4_0_4_8";
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: return "Q4_0_8_8";
 
         default: return "unknown, may not work";
     }
@@ -5307,6 +5537,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_16B: return "16B";
         case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
+        case MODEL_32B: return "32B";
         case MODEL_34B: return "34B";
         case MODEL_35B: return "35B";
         case MODEL_40B: return "40B";
@@ -5375,7 +5606,7 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
+    ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false);
 
     // everything past this point is not vocab-related
     if (hparams.vocab_only) {
@@ -5388,6 +5619,16 @@ static void llm_load_hparams(
     ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
     ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
 
+    if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) {
+        ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features);
+
+        ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd);
+        ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer);
+
+        ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd);
+        ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer);
+    }
+
     GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
     GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
     if (hparams.n_expert > 0) {
@@ -5396,13 +5637,13 @@ static void llm_load_hparams(
         GGML_ASSERT(hparams.n_expert_used == 0);
     }
 
-    // zero-out the
+    // zero-out the array hparams
     std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
     std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
     std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0);
 
-    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer);
-    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer);
+    ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false);
+    ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv_arr = hparams.n_head_arr;
@@ -5494,8 +5735,12 @@ static void llm_load_hparams(
         case LLM_ARCH_MINICPM:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
+                ml.get_key(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
 
                 switch (hparams.n_layer) {
+                    case 52: model.type = e_model::MODEL_1B; break;
                     case 40: model.type = e_model::MODEL_2B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -5660,6 +5905,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                std::array<int, 4> section_dims;
+                ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, section_dims, 4, true);
+                std::copy(section_dims.begin(), section_dims.begin() + 4, std::begin(hparams.rope_sections));
+            }
+            // fall through
         case LLM_ARCH_QWEN2:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5667,7 +5919,10 @@ static void llm_load_hparams(
                     case 24: model.type = hparams.n_embd == 1024 ? e_model::MODEL_0_5B : e_model::MODEL_1B; break;
                     case 28: model.type = hparams.n_embd == 1536 ? e_model::MODEL_1_5B : e_model::MODEL_7B; break;
                     case 32: model.type = e_model::MODEL_7B; break;
+                    case 36: model.type = e_model::MODEL_3B; break;
                     case 40: model.type = hparams.n_head() == 20 ? e_model::MODEL_4B : e_model::MODEL_13B; break;
+                    case 48: model.type = e_model::MODEL_14B; break;
+                    case 64: model.type = e_model::MODEL_32B; break;
                     case 80: model.type = e_model::MODEL_70B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -5877,6 +6132,17 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_OLMO2:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 16: model.type = e_model::MODEL_1B; break;
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5956,6 +6222,19 @@ static void llm_load_hparams(
                     model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+                ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+                ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+
+                switch (hparams.n_layer) {
+                    case 28: model.type = e_model::MODEL_20B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_DEEPSEEK2:
            {
                 bool is_lite = (hparams.n_layer == 27);
@@ -6109,6 +6388,13 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps);
+                ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+            } break;
         default: (void)0;
     }
 
@@ -6138,7 +6424,7 @@ static void llm_load_vocab(
     ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
     ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-    if (tokenizer_model == "no_vocab") {
+    if (tokenizer_model == "no_vocab" || tokenizer_model == "none") {
         vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
         // default special tokens
@@ -6302,10 +6588,12 @@ static void llm_load_vocab(
                 tokenizer_pre == "phi-2" ||
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
+                tokenizer_pre == "gigachat" ||
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "jina-v2-code"
+                tokenizer_pre == "jina-v2-code" ||
+                tokenizer_pre == "roberta-bpe") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
         } else if (
                 tokenizer_pre == "refact") {
@@ -6372,6 +6660,9 @@ static void llm_load_vocab(
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;
             vocab.tokenizer_add_bos = true;
             vocab.tokenizer_clean_spaces = false;
+        } else if (
+                tokenizer_pre == "minerva-7b") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -6950,6 +7241,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 
     LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len);
 
+    if (model.arch == LLM_ARCH_DEEPSEEK) {
+        LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
+        LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
+        LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared);
+        LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale);
+    }
+
     if (model.arch == LLM_ARCH_DEEPSEEK2) {
         LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
         LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q);
@@ -6965,7 +7263,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
         LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
     }
 
-    if (model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
+    if (model.arch == LLM_ARCH_MINICPM || model.arch == LLM_ARCH_GRANITE || model.arch == LLM_ARCH_GRANITE_MOE) {
         LLAMA_LOG_INFO("%s: f_embedding_scale = %f\n", __func__, hparams.f_embedding_scale);
         LLAMA_LOG_INFO("%s: f_residual_scale = %f\n", __func__, hparams.f_residual_scale);
         LLAMA_LOG_INFO("%s: f_attention_scale = %f\n", __func__, hparams.f_attention_scale);
@@ -7106,6 +7404,22 @@ static const std::map<llm_tensor, llm_tensor_info> llm_tensor_info_mapping = {
     {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}},
     // this tensor is loaded for T5, but never used
     {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}},
+    {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}},
+    {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
 };
 
 // checks if the weight tensor can be used with the specified buffer type and device
@@ -7149,12 +7463,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
             } break;
         case GGML_OP_ADD:
             {
-                ggml_tensor * a =
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_add(ctx, a, w);
             } break;
         case GGML_OP_MUL:
             {
-                ggml_tensor * a =
+                ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, w->ne[0], w->ne[1], w->ne[2], w->ne[3]);
                 op_tensor = ggml_mul(ctx, a, w);
             } break;
         case GGML_OP_DIV:
@@ -7210,6 +7524,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
                 ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H);
                 op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state);
             } break;
+        case GGML_OP_IM2COL:
+            {
+                const int n_embd = hparams.n_embd;
+                ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1);
+                op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16);
+            } break;
         default:
             GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name);
     }
@@ -7340,7 +7660,8 @@ static bool llm_load_tensors(
     model.main_gpu = main_gpu;
     model.n_gpu_layers = n_gpu_layers;
 
-    const int n_layer
+    const int n_layer = hparams.n_layer;
+
     bool use_mmap_buffer = true;
 
     // build a list of buffer types for the CPU and GPU devices
@@ -7590,7 +7911,13 @@ static bool llm_load_tensors(
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
-
+                        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+                            layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
+                        else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+                        }
 
                         if (n_expert == 0) {
                             layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
@@ -8057,6 +8384,7 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_QWEN2:
+            case LLM_ARCH_QWEN2VL:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
@@ -8559,6 +8887,31 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
+            case LLM_ARCH_OLMO2:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
+                        layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                        layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
+                    }
+                } break;
             case LLM_ARCH_OLMOE:
                 {
                     model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -8692,6 +9045,55 @@ static bool llm_load_tensors(
                         layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0);
                     }
                 } break;
+            case LLM_ARCH_DEEPSEEK:
+                {
+
+                    const int64_t n_ff_exp = hparams.n_ff_exp;
+                    const int64_t n_expert_shared = hparams.n_expert_shared;
+
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+                        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
+                        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
+                        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
+                        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+
+                        if (i < (int) hparams.n_layer_dense_lead) {
+                            layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+                            layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
+                            layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                        } else {
+                            layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+
+                            if (n_expert == 0) {
+                                throw std::runtime_error("n_expert must be > 0");
+                            }
+                            if (n_expert_used == 0) {
+                                throw std::runtime_error("n_expert_used must be > 0");
+                            }
+
+                            // MoE branch
+                            layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+                            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
+                            layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0);
+
+                            // Shared expert branch
+                            layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                            layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0);
+                            layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
+                        }
+                    }
+                } break;
             case LLM_ARCH_DEEPSEEK2:
                 {
                     const bool is_lite = (hparams.n_layer == 27);
@@ -9062,9 +9464,9 @@ static bool llm_load_tensors(
                 } break;
             case LLM_ARCH_CHAMELEON:
                 {
-
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-
+                    // output
                     model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
                     // if output is NULL, init from the input tok embed
@@ -9093,13 +9495,116 @@ static bool llm_load_tensors(
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
                     }
                 } break;
-
-
-
+            case LLM_ARCH_WAVTOKENIZER_DEC:
+                {
+                    model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0);
 
-
-
-
+                    model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0);
+                    model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0);
+
+                    // posnet
+                    {
+                        const int64_t n_embd = hparams.posnet.n_embd;
+
+                        for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) {
+                            auto & layer = model.layers[i].posnet;
+
+                            // posnet:
+                            //
+                            // - resnet
+                            // - resnet
+                            // - attn
+                            // - resnet
+                            // - resnet
+                            // - norm
+                            //
+                            switch (i) {
+                                case 0:
+                                case 1:
+                                case 3:
+                                case 4:
+                                    {
+                                        layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0);
+                                        layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0);
+
+                                        layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0);
+
+                                        layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0);
+                                        layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0);
+
+                                        layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0);
+                                        layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0);
+                                    } break;
+                                case 2:
+                                    {
+                                        layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+
+                                        layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0);
+
+                                        layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0);
+
+                                        layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0);
+
+                                        layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0);
+                                        layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0);
+                                    } break;
+                                case 5:
+                                    {
+                                        layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0);
+                                        layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0);
+                                    } break;
+                                default: GGML_ABORT("unknown posnet layer");
+                            };
+                        }
+                    }
+
+                    GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd);
+
+                    model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0);
+                    model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0);
+
+                    // convnext
+                    {
+                        const int64_t n_embd = hparams.convnext.n_embd;
+
+                        for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) {
+                            auto & layer = model.layers[i].convnext;
+
+                            layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0);
+                            layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0);
+
+                            layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0);
+                            layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0);
+
+                            layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0);
+                            layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0);
+
+                            layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0);
+                            layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0);
+
+                            layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0);
+                        }
+
+                        // output
+                        model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                        model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
+                    }
+
+                    model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0);
+                    model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0);
+                } break;
+            default:
+                throw std::runtime_error("unknown architecture");
+        }
+
+        if (n_moved_tensors > 0) {
+            LLAMA_LOG_DEBUG("%s: tensor '%s' (%s) (and %d others) cannot be used with preferred buffer type %s, using %s instead\n",
+                __func__, first_moved_tensor->name, ggml_type_name(first_moved_tensor->type), n_moved_tensors - 1,
                 ggml_backend_buft_name(first_moved_from_buft), ggml_backend_buft_name(first_moved_to_buft));
         }
     }
@@ -9133,7 +9638,7 @@ static bool llm_load_tensors(
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
-            dev =
+            dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -9312,6 +9817,7 @@ enum llm_ffn_gate_type {
 enum llm_norm_type {
     LLM_NORM,
     LLM_NORM_RMS,
+    LLM_NORM_GROUP,
 };
 
 static struct ggml_tensor * llm_build_inp_embd(
@@ -9332,7 +9838,7 @@ static struct ggml_tensor * llm_build_inp_embd(
 
         inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens);
     } else {
-
+        lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens);
         inpL = lctx.inp_embd;
         ggml_set_input(lctx.inp_embd);
     }
@@ -9453,8 +9959,14 @@ static struct ggml_tensor * llm_build_norm(
          const llm_build_cb & cb,
          int il) {
     switch (type) {
-        case LLM_NORM:
-        case LLM_NORM_RMS:
+        case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break;
+        case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break;
+        case LLM_NORM_GROUP:
+            {
+                cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+                cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps);
+                cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]);
+            } break;
     }
 
     if (mw || mb) {
@@ -12421,6 +12933,124 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_qwen2vl() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        lctx.inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens * 4);
+        cb(lctx.inp_pos, "inp_pos", -1);
+        ggml_set_input(lctx.inp_pos);
+        struct ggml_tensor * inp_pos = lctx.inp_pos;
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+        int sections[4];
+        std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_multi(
+                    ctx0,
+                    ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, lctx, cur,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_qwen2moe() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
@@ -12704,7 +13334,13 @@ struct llm_build_context {
         struct ggml_tensor * inp_pos = build_inp_pos();
 
         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor *
+        struct ggml_tensor * KQ_mask = nullptr;
+        if (hparams.n_swa == 0) {
+            // Phi-4 doesn't use sliding window attention
+            KQ_mask = build_inp_KQ_mask();
+        } else {
+            KQ_mask = build_inp_KQ_mask_swa();
+        }
 
         for (int il = 0; il < n_layer; ++il) {
             auto residual = inpL;
@@ -12762,7 +13398,7 @@ struct llm_build_context {
 
                 cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
-                        Kcur, Vcur, Qcur,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
             }
 
             if (il == n_layer - 1) {
@@ -13372,21 +14008,18 @@ struct llm_build_context {
         return gf;
     }
 
-
-    // https://github.com/ggerganov/llama.cpp/issues/5276#issuecomment-1925774738
-    // based on the original build_llama() function
-    struct ggml_cgraph * build_minicpm() {
+    struct ggml_cgraph * build_minicpm3() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
 
-        const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_head == hparams.n_rot);
-
-        const int64_t n_embd = hparams.n_embd;
         //TODO: if the model varies, these parameters need to be read from the model
         const int64_t n_embd_base = 256;
         const float scale_embd = 12.0f;
         const float scale_depth = 1.4f;
+        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+
+        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
+        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+        const uint32_t kv_lora_rank = hparams.n_lora_kv;
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -13406,209 +14039,65 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
+            struct ggml_tensor * rope_factors = build_rope_factors(il);
             // norm
             cur = llm_build_norm(ctx0, inpL, hparams,
                     model.layers[il].attn_norm, NULL,
                     LLM_NORM_RMS, cb, il);
             cb(cur, "attn_norm", il);
 
-            //
+            // self_attention
             {
-
-
-
-
-                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
-                cb(Qcur, "Qcur", il);
-                }
-
-                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
-                if (model.layers[il].bk) {
-                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
-                    cb(Kcur, "Kcur", il);
-                }
+                struct ggml_tensor * q = NULL;
+                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
+                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
+                cb(q, "q", il);
 
-
-
-
-
-                cb(Vcur, "Vcur", il);
-                }
+                q = llm_build_norm(ctx0, q, hparams,
+                        model.layers[il].attn_q_a_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(q, "q", il);
 
-
-
-
-                    ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(Qcur, "Qcur", il);
+                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
+                cb(q, "q", il);
 
-
-
-
-
-
-                cb(
+                // split into {n_head * n_embd_head_qk_nope, n_tokens}
+                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
+                cb(q_nope, "q_nope", il);
 
-
-
-
-
+                // and {n_head * n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
+                cb(q_pe, "q_pe", il);
 
-
-
-
-                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-            }
+                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
 
-
-
-
-
+                // split into {kv_lora_rank, n_tokens}
+                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
 
-
-
+                // and {n_embd_head_qk_rope, n_tokens}
+                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
+                cb(k_pe, "k_pe", il);
 
-
-
-
-                        model.layers[il].ffn_norm, NULL,
+                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
+                        model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
-                cb(
-
-                cur = llm_build_ffn(ctx0, lctx, cur,
-                        model.layers[il].ffn_up, NULL, NULL,
-                        model.layers[il].ffn_gate, NULL, NULL,
-                        model.layers[il].ffn_down, NULL, NULL,
-                        NULL,
-                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-                cb(cur, "ffn_out", il);
-            }
-
-            // scale the hidden states for residual connection
-            cur = ggml_scale(ctx0, cur, scale_res);
-            cb(cur, "hidden_scaled_ffn", -1);
-
-            cur = ggml_add(ctx0, cur, ffn_inp);
-            cur = lctx.cvec.apply_to(ctx0, cur, il);
-            cb(cur, "l_out", il);
-
-            // input for next layer
-            inpL = cur;
-        }
-
-        cur = inpL;
-
-        cur = llm_build_norm(ctx0, cur, hparams,
-                model.output_norm, NULL,
-                LLM_NORM_RMS, cb, -1);
-        cb(cur, "result_norm", -1);
-
-        // lm_head scaling
-        const float scale_lmhead = float(n_embd_base)/float(n_embd);
-        cur = ggml_scale(ctx0, cur, scale_lmhead);
-        cb(cur, "lmhead_scaling", -1);
-
-        // lm_head
-        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
-        cb(cur, "result_output", -1);
-
-        ggml_build_forward_expand(gf, cur);
-
-        return gf;
-    }
-
-    struct ggml_cgraph * build_minicpm3() {
-        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
-
-        //TODO: if the model varies, these parameters need to be read from the model
-        const int64_t n_embd_base = 256;
-        const float scale_embd = 12.0f;
-        const float scale_depth = 1.4f;
-        const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
-
-        const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-        const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
-        const uint32_t kv_lora_rank = hparams.n_lora_kv;
-
-        struct ggml_tensor * cur;
-        struct ggml_tensor * inpL;
-
-        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
-
-        // scale the input embeddings
-        inpL = ggml_scale(ctx0, inpL, scale_embd);
-        cb(inpL, "inp_scaled", -1);
-
-        // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
-
-        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
-
-        for (int il = 0; il < n_layer; ++il) {
-            struct ggml_tensor * inpSA = inpL;
-
-            struct ggml_tensor * rope_factors = build_rope_factors(il);
-            // norm
-            cur = llm_build_norm(ctx0, inpL, hparams,
-                    model.layers[il].attn_norm, NULL,
-                    LLM_NORM_RMS, cb, il);
-            cb(cur, "attn_norm", il);
-
-            // self_attention
-            {
-                struct ggml_tensor * q = NULL;
-                // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
-                cb(q, "q", il);
-
-                q = llm_build_norm(ctx0, q, hparams,
-                        model.layers[il].attn_q_a_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(q, "q", il);
-
-                // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
-                q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
-                cb(q, "q", il);
-
-                // split into {n_head * n_embd_head_qk_nope, n_tokens}
-                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
-                        ggml_row_size(q->type, hparams.n_embd_head_k),
-                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                        0);
-                cb(q_nope, "q_nope", il);
-
-                // and {n_head * n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
-                        ggml_row_size(q->type, hparams.n_embd_head_k),
-                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
-                        ggml_row_size(q->type, n_embd_head_qk_nope));
-                cb(q_pe, "q_pe", il);
-
-                // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
-
-                // split into {kv_lora_rank, n_tokens}
-                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
-                        kv_pe_compresseed->nb[1],
-                        0);
-                cb(kv_compressed, "kv_compressed", il);
-
-                // and {n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
-                        kv_pe_compresseed->nb[1],
-                        kv_pe_compresseed->nb[1],
-                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
-                cb(k_pe, "k_pe", il);
-
-                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
-                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
-                        model.layers[il].attn_kv_a_norm, NULL,
-                        LLM_NORM_RMS, cb, il);
-                cb(kv_compressed, "kv_compressed", il);
+                cb(kv_compressed, "kv_compressed", il);
 
                 // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
                 struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
@@ -14424,6 +14913,130 @@ struct llm_build_context {
         return gf;
     }
 
+    struct ggml_cgraph * build_olmo2() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = inpL;
+
+            // self_attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Qcur, "Qcur_normed", il);
+
+                Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(Kcur, "Kcur_normed", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur_rope", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur_rope", il);
+
+                cur = llm_build_kv(ctx0, lctx, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_post_norm", il);
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_ffn(ctx0, lctx, ffn_inp,
+                    model.layers[il].ffn_up, NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            cur = llm_build_norm(ctx0, cur, hparams,
+                    model.layers[il].ffn_post_norm, NULL,
+                    LLM_NORM_RMS, cb, -1);
+            cb(cur, "ffn_post_norm", -1);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = lctx.cvec.apply_to(ctx0, cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // based on the build_qwen2moe() function, changes:
     // * removed shared experts
     // * removed bias
@@ -14905,29 +15518,183 @@ struct llm_build_context {
|
|
|
14905
15518
|
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
14906
15519
|
cb(cur, "ffn_out", il);
|
|
14907
15520
|
|
|
14908
|
-
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
|
14909
|
-
cb(ffn_out, "ffn_out", il);
|
|
15521
|
+
struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
|
|
15522
|
+
cb(ffn_out, "ffn_out", il);
|
|
15523
|
+
|
|
15524
|
+
// MoE
|
|
15525
|
+
cur = llm_build_norm(ctx0, inpSA, hparams,
|
|
15526
|
+
model.layers[il].ffn_norm_exps, NULL,
|
|
15527
|
+
LLM_NORM_RMS, cb, il);
|
|
15528
|
+
cb(cur, "ffn_norm_exps", il);
|
|
15529
|
+
|
|
15530
|
+
cur = llm_build_moe_ffn(ctx0, lctx, cur,
|
|
15531
|
+
model.layers[il].ffn_gate_inp,
|
|
15532
|
+
model.layers[il].ffn_up_exps,
|
|
15533
|
+
model.layers[il].ffn_gate_exps,
|
|
15534
|
+
model.layers[il].ffn_down_exps,
|
|
15535
|
+
n_expert, n_expert_used,
|
|
15536
|
+
LLM_FFN_SILU, true,
|
|
15537
|
+
false, 0.0,
|
|
15538
|
+
cb, il);
|
|
15539
|
+
cb(cur, "ffn_moe_out", il);
|
|
15540
|
+
|
|
15541
|
+
cur = ggml_add(ctx0, cur, ffn_out);
|
|
15542
|
+
cb(cur, "ffn_out", il);
|
|
15543
|
+
|
|
15544
|
+
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
|
15545
|
+
cb(cur, "l_out", il);
|
|
15546
|
+
|
|
15547
|
+
// input for next layer
|
|
15548
|
+
inpL = cur;
|
|
15549
|
+
}
|
|
15550
|
+
|
|
15551
|
+
cur = inpL;
|
|
15552
|
+
|
|
15553
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
15554
|
+
model.output_norm, NULL,
|
|
15555
|
+
LLM_NORM_RMS, cb, -1);
|
|
15556
|
+
cb(cur, "result_norm", -1);
|
|
15557
|
+
|
|
15558
|
+
// lm_head
|
|
15559
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
|
15560
|
+
cb(cur, "result_output", -1);
|
|
15561
|
+
|
|
15562
|
+
ggml_build_forward_expand(gf, cur);
|
|
15563
|
+
|
|
15564
|
+
return gf;
|
|
15565
|
+
}
|
|
15566
|
+
|
|
15567
|
+
struct ggml_cgraph * build_deepseek() {
|
|
15568
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
|
15569
|
+
|
|
15570
|
+
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
|
15571
|
+
int32_t n_tokens = this->n_tokens;
|
|
15572
|
+
|
|
15573
|
+
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
15574
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
15575
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
15576
|
+
|
|
15577
|
+
struct ggml_tensor * cur;
|
|
15578
|
+
struct ggml_tensor * inpL;
|
|
15579
|
+
|
|
15580
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
|
15581
|
+
|
|
15582
|
+
// inp_pos - contains the positions
|
|
15583
|
+
struct ggml_tensor * inp_pos = build_inp_pos();
|
|
15584
|
+
|
|
15585
|
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
|
15586
|
+
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
|
15587
|
+
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
|
|
15588
|
+
for (int il = 0; il < n_layer; ++il) {
|
|
15589
|
+
struct ggml_tensor * inpSA = inpL;
|
|
15590
|
+
|
|
15591
|
+
// norm
|
|
15592
|
+
cur = llm_build_norm(ctx0, inpL, hparams,
|
|
15593
|
+
model.layers[il].attn_norm, NULL,
|
|
15594
|
+
LLM_NORM_RMS, cb, il);
|
|
15595
|
+
cb(cur, "attn_norm", il);
|
|
15596
|
+
|
|
15597
|
+
// self-attention
|
|
15598
|
+
{
|
|
15599
|
+
// rope freq factors for llama3; may return nullptr for llama2 and other models
|
|
15600
|
+
struct ggml_tensor * rope_factors = build_rope_factors(il);
|
|
15601
|
+
|
|
15602
|
+
// compute Q and K and RoPE them
|
|
15603
|
+
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
|
15604
|
+
cb(Qcur, "Qcur", il);
|
|
15605
|
+
if (model.layers[il].bq) {
|
|
15606
|
+
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
|
15607
|
+
cb(Qcur, "Qcur", il);
|
|
15608
|
+
}
|
|
15609
|
+
|
|
15610
|
+
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
|
15611
|
+
cb(Kcur, "Kcur", il);
|
|
15612
|
+
if (model.layers[il].bk) {
|
|
15613
|
+
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
|
15614
|
+
cb(Kcur, "Kcur", il);
|
|
15615
|
+
}
|
|
15616
|
+
|
|
15617
|
+
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
|
15618
|
+
cb(Vcur, "Vcur", il);
|
|
15619
|
+
if (model.layers[il].bv) {
|
|
15620
|
+
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
|
15621
|
+
cb(Vcur, "Vcur", il);
|
|
15622
|
+
}
|
|
15623
|
+
|
|
15624
|
+
Qcur = ggml_rope_ext(
|
|
15625
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
|
|
15626
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15627
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15628
|
+
);
|
|
15629
|
+
cb(Qcur, "Qcur", il);
|
|
15630
|
+
|
|
15631
|
+
Kcur = ggml_rope_ext(
|
|
15632
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
|
|
15633
|
+
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
|
15634
|
+
ext_factor, attn_factor, beta_fast, beta_slow
|
|
15635
|
+
);
|
|
15636
|
+
cb(Kcur, "Kcur", il);
|
|
15637
|
+
|
|
15638
|
+
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
|
15639
|
+
model.layers[il].wo, model.layers[il].bo,
|
|
15640
|
+
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
|
|
15641
|
+
}
|
|
15642
|
+
|
|
15643
|
+
if (il == n_layer - 1) {
|
|
15644
|
+
// skip computing output for unused tokens
|
|
15645
|
+
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
|
15646
|
+
n_tokens = n_outputs;
|
|
15647
|
+
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
|
15648
|
+
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
|
15649
|
+
}
|
|
15650
|
+
|
|
14910
15651
|
|
|
14911
|
-
|
|
14912
|
-
|
|
14913
|
-
|
|
15652
|
+
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
|
15653
|
+
cb(ffn_inp, "ffn_inp", il);
|
|
15654
|
+
|
|
15655
|
+
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
|
15656
|
+
model.layers[il].ffn_norm, NULL,
|
|
14914
15657
|
LLM_NORM_RMS, cb, il);
|
|
14915
|
-
cb(cur, "
|
|
15658
|
+
cb(cur, "ffn_norm", il);
|
|
14916
15659
|
|
|
14917
|
-
|
|
14918
|
-
|
|
14919
|
-
|
|
14920
|
-
|
|
14921
|
-
|
|
14922
|
-
|
|
14923
|
-
|
|
14924
|
-
|
|
14925
|
-
|
|
14926
|
-
|
|
15660
|
+
if ((uint32_t) il < hparams.n_layer_dense_lead) {
|
|
15661
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
|
15662
|
+
model.layers[il].ffn_up, NULL, NULL,
|
|
15663
|
+
model.layers[il].ffn_gate, NULL, NULL,
|
|
15664
|
+
model.layers[il].ffn_down, NULL, NULL,
|
|
15665
|
+
NULL,
|
|
15666
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
15667
|
+
cb(cur, "ffn_out", il);
|
|
15668
|
+
} else {
|
|
15669
|
+
// MoE branch
|
|
15670
|
+
ggml_tensor * moe_out =
|
|
15671
|
+
llm_build_moe_ffn(ctx0, lctx, cur,
|
|
15672
|
+
model.layers[il].ffn_gate_inp,
|
|
15673
|
+
model.layers[il].ffn_up_exps,
|
|
15674
|
+
model.layers[il].ffn_gate_exps,
|
|
15675
|
+
model.layers[il].ffn_down_exps,
|
|
15676
|
+
n_expert, n_expert_used,
|
|
15677
|
+
LLM_FFN_SILU, false,
|
|
15678
|
+
false, hparams.expert_weights_scale,
|
|
15679
|
+
cb, il);
|
|
15680
|
+
cb(moe_out, "ffn_moe_out", il);
|
|
14927
15681
|
|
|
14928
|
-
|
|
14929
|
-
|
|
15682
|
+
// FFN shared expert
|
|
15683
|
+
{
|
|
15684
|
+
ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
|
|
15685
|
+
model.layers[il].ffn_up_shexp, NULL, NULL,
|
|
15686
|
+
model.layers[il].ffn_gate_shexp, NULL, NULL,
|
|
15687
|
+
model.layers[il].ffn_down_shexp, NULL, NULL,
|
|
15688
|
+
NULL,
|
|
15689
|
+
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
|
15690
|
+
cb(ffn_shexp, "ffn_shexp", il);
|
|
15691
|
+
|
|
15692
|
+
cur = ggml_add(ctx0, moe_out, ffn_shexp);
|
|
15693
|
+
cb(cur, "ffn_out", il);
|
|
15694
|
+
}
|
|
15695
|
+
}
|
|
14930
15696
|
|
|
15697
|
+
cur = ggml_add(ctx0, cur, ffn_inp);
|
|
14931
15698
|
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
|
14932
15699
|
cb(cur, "l_out", il);
|
|
14933
15700
|
|
|
@@ -14944,6 +15711,7 @@ struct llm_build_context {

         // lm_head
         cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
+
         cb(cur, "result_output", -1);

         ggml_build_forward_expand(gf, cur);
@@ -15330,7 +16098,7 @@ struct llm_build_context {
         return gf;
     }

-    struct ggml_cgraph *
+    struct ggml_cgraph * build_t5_enc() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -15462,7 +16230,7 @@ struct llm_build_context {
         return gf;
     }

-    struct ggml_cgraph *
+    struct ggml_cgraph * build_t5_dec() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);

         // mutable variable, needed during the last layer of the computation to skip unused tokens
@@ -16411,6 +17179,158 @@ struct llm_build_context {
|
|
|
16411
17179
|
|
|
16412
17180
|
return gf;
|
|
16413
17181
|
}
|
|
17182
|
+
|
|
17183
|
+
struct ggml_cgraph * build_wavtokenizer_dec() {
|
|
17184
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
|
17185
|
+
|
|
17186
|
+
struct ggml_tensor * cur;
|
|
17187
|
+
struct ggml_tensor * inpL;
|
|
17188
|
+
|
|
17189
|
+
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
|
17190
|
+
|
|
17191
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL));
|
|
17192
|
+
|
|
17193
|
+
cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1);
|
|
17194
|
+
cur = ggml_add(ctx0, cur, model.conv1d_b);
|
|
17195
|
+
|
|
17196
|
+
// posnet
|
|
17197
|
+
for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) {
|
|
17198
|
+
const auto & layer = model.layers[il].posnet;
|
|
17199
|
+
|
|
17200
|
+
inpL = cur;
|
|
17201
|
+
|
|
17202
|
+
switch (il) {
|
|
17203
|
+
case 0:
|
|
17204
|
+
case 1:
|
|
17205
|
+
case 3:
|
|
17206
|
+
case 4:
|
|
17207
|
+
{
|
|
17208
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17209
|
+
layer.norm1,
|
|
17210
|
+
layer.norm1_b,
|
|
17211
|
+
LLM_NORM_GROUP, cb, 0);
|
|
17212
|
+
|
|
17213
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
17214
|
+
|
|
17215
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1);
|
|
17216
|
+
cur = ggml_add(ctx0, cur, layer.conv1_b);
|
|
17217
|
+
|
|
17218
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17219
|
+
layer.norm2,
|
|
17220
|
+
layer.norm2_b,
|
|
17221
|
+
LLM_NORM_GROUP, cb, 0);
|
|
17222
|
+
|
|
17223
|
+
cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur);
|
|
17224
|
+
|
|
17225
|
+
cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1);
|
|
17226
|
+
cur = ggml_add(ctx0, cur, layer.conv2_b);
|
|
17227
|
+
|
|
17228
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
17229
|
+
} break;
|
|
17230
|
+
case 2:
|
|
17231
|
+
{
|
|
17232
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17233
|
+
layer.attn_norm,
|
|
17234
|
+
layer.attn_norm_b,
|
|
17235
|
+
LLM_NORM_GROUP, cb, 0);
|
|
17236
|
+
|
|
17237
|
+
struct ggml_tensor * q;
|
|
17238
|
+
struct ggml_tensor * k;
|
|
17239
|
+
struct ggml_tensor * v;
|
|
17240
|
+
|
|
17241
|
+
q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1);
|
|
17242
|
+
k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1);
|
|
17243
|
+
v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1);
|
|
17244
|
+
|
|
17245
|
+
q = ggml_add(ctx0, q, layer.attn_q_b);
|
|
17246
|
+
k = ggml_add(ctx0, k, layer.attn_k_b);
|
|
17247
|
+
v = ggml_add(ctx0, v, layer.attn_v_b);
|
|
17248
|
+
|
|
17249
|
+
q = ggml_cont(ctx0, ggml_transpose(ctx0, q));
|
|
17250
|
+
k = ggml_cont(ctx0, ggml_transpose(ctx0, k));
|
|
17251
|
+
|
|
17252
|
+
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
|
17253
|
+
|
|
17254
|
+
kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f);
|
|
17255
|
+
|
|
17256
|
+
cur = ggml_mul_mat(ctx0, kq, v);
|
|
17257
|
+
|
|
17258
|
+
cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1);
|
|
17259
|
+
cur = ggml_add(ctx0, cur, layer.attn_o_b);
|
|
17260
|
+
|
|
17261
|
+
cur = ggml_add(ctx0, cur, inpL);
|
|
17262
|
+
} break;
|
|
17263
|
+
case 5:
|
|
17264
|
+
{
|
|
17265
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17266
|
+
layer.norm,
|
|
17267
|
+
layer.norm_b,
|
|
17268
|
+
LLM_NORM_GROUP, cb, 0);
|
|
17269
|
+
} break;
|
|
17270
|
+
default: GGML_ABORT("unknown posnet layer");
|
|
17271
|
+
};
|
|
17272
|
+
}
|
|
17273
|
+
|
|
17274
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
17275
|
+
|
|
17276
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17277
|
+
model.tok_norm,
|
|
17278
|
+
model.tok_norm_b,
|
|
17279
|
+
LLM_NORM, cb, -1);
|
|
17280
|
+
|
|
17281
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
17282
|
+
|
|
17283
|
+
inpL = cur;
|
|
17284
|
+
|
|
17285
|
+
// convnext
|
|
17286
|
+
for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) {
|
|
17287
|
+
const auto & layer = model.layers[il].convnext;
|
|
17288
|
+
|
|
17289
|
+
cur = inpL;
|
|
17290
|
+
|
|
17291
|
+
cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1);
|
|
17292
|
+
cur = ggml_add(ctx0, cur, layer.dw_b);
|
|
17293
|
+
|
|
17294
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
17295
|
+
|
|
17296
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17297
|
+
layer.norm,
|
|
17298
|
+
layer.norm_b,
|
|
17299
|
+
LLM_NORM, cb, -1);
|
|
17300
|
+
|
|
17301
|
+
cur = llm_build_ffn(ctx0, lctx, cur,
|
|
17302
|
+
layer.pw1, layer.pw1_b, NULL,
|
|
17303
|
+
NULL, NULL, NULL,
|
|
17304
|
+
layer.pw2, layer.pw2_b, NULL,
|
|
17305
|
+
NULL,
|
|
17306
|
+
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
|
17307
|
+
|
|
17308
|
+
cur = ggml_mul(ctx0, cur, layer.gamma);
|
|
17309
|
+
|
|
17310
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
17311
|
+
|
|
17312
|
+
inpL = ggml_add(ctx0, cur, inpL);
|
|
17313
|
+
}
|
|
17314
|
+
|
|
17315
|
+
cur = inpL;
|
|
17316
|
+
|
|
17317
|
+
cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
|
|
17318
|
+
|
|
17319
|
+
cur = llm_build_norm(ctx0, cur, hparams,
|
|
17320
|
+
model.output_norm,
|
|
17321
|
+
model.output_norm_b,
|
|
17322
|
+
LLM_NORM, cb, -1);
|
|
17323
|
+
|
|
17324
|
+
// lm_head
|
|
17325
|
+
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
|
17326
|
+
|
|
17327
|
+
cur = ggml_add(ctx0, cur, model.output_b);
|
|
17328
|
+
cb(cur, "result_embd", -1);
|
|
17329
|
+
|
|
17330
|
+
ggml_build_forward_expand(gf, cur);
|
|
17331
|
+
|
|
17332
|
+
return gf;
|
|
17333
|
+
}
|
|
16414
17334
|
};
|
|
16415
17335
|
|
|
16416
17336
|
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
|
@@ -16493,6 +17413,7 @@ static struct ggml_cgraph * llama_build_graph(

     switch (model.arch) {
         case LLM_ARCH_LLAMA:
+        case LLM_ARCH_MINICPM:
         case LLM_ARCH_GRANITE:
         case LLM_ARCH_GRANITE_MOE:
             {
@@ -16544,6 +17465,11 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_qwen2();
             } break;
+        case LLM_ARCH_QWEN2VL:
+            {
+                lctx.n_pos_per_token = 4;
+                result = llm.build_qwen2vl();
+            } break;
         case LLM_ARCH_QWEN2MOE:
             {
                 result = llm.build_qwen2moe();
@@ -16576,10 +17502,6 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_internlm2();
             } break;
-        case LLM_ARCH_MINICPM:
-            {
-                result = llm.build_minicpm();
-            } break;
         case LLM_ARCH_MINICPM3:
             {
                 result = llm.build_minicpm3();
@@ -16616,6 +17538,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_olmo();
             } break;
+        case LLM_ARCH_OLMO2:
+            {
+                result = llm.build_olmo2();
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 result = llm.build_olmoe();
@@ -16632,6 +17558,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_arctic();
             } break;
+        case LLM_ARCH_DEEPSEEK:
+            {
+                result = llm.build_deepseek();
+            } break;
         case LLM_ARCH_DEEPSEEK2:
             {
                 result = llm.build_deepseek2();
@@ -16647,14 +17577,14 @@ static struct ggml_cgraph * llama_build_graph(
         case LLM_ARCH_T5:
             {
                 if (lctx.is_encoding) {
-                    result = llm.
+                    result = llm.build_t5_enc();
                 } else {
-                    result = llm.
+                    result = llm.build_t5_dec();
                 }
             } break;
         case LLM_ARCH_T5ENCODER:
             {
-                result = llm.
+                result = llm.build_t5_enc();
             } break;
         case LLM_ARCH_JAIS:
             {
@@ -16676,6 +17606,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_chameleon();
             } break;
+        case LLM_ARCH_WAVTOKENIZER_DEC:
+            {
+                result = llm.build_wavtokenizer_dec();
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -16762,35 +17696,40 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch)

     if (ubatch.pos && lctx.inp_pos) {
         const int64_t n_tokens = ubatch.n_tokens;
-
-        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos));
+        auto n_pos = lctx.n_pos_per_token;
+        ggml_backend_tensor_set(lctx.inp_pos, ubatch.pos, 0, n_tokens*n_pos*ggml_element_size(lctx.inp_pos));
     }

     if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) {
-        GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
-
+        //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs");
+
+        if (!lctx.inp_out_ids) {
+            LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__);
+        } else {
+            const int64_t n_tokens = ubatch.n_tokens;

-
-
+            GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer));
+            int32_t * data = (int32_t *) lctx.inp_out_ids->data;

-
-
-
-            }
-        } else if (ubatch.output) {
-            int32_t n_outputs = 0;
-            for (int i = 0; i < n_tokens; ++i) {
-                if (ubatch.output[i]) {
-                    data[n_outputs++] = i;
+            if (lctx.n_outputs == n_tokens) {
+                for (int i = 0; i < n_tokens; ++i) {
+                    data[i] = i;
                 }
+            } else if (ubatch.output) {
+                int32_t n_outputs = 0;
+                for (int i = 0; i < n_tokens; ++i) {
+                    if (ubatch.output[i]) {
+                        data[n_outputs++] = i;
+                    }
+                }
+                // the graph needs to have been passed the correct number of outputs
+                GGML_ASSERT(lctx.n_outputs == n_outputs);
+            } else if (lctx.n_outputs == 1) {
+                // only keep last output
+                data[0] = n_tokens - 1;
+            } else {
+                GGML_ASSERT(lctx.n_outputs == 0);
             }
-            // the graph needs to have been passed the correct number of outputs
-            GGML_ASSERT(lctx.n_outputs == n_outputs);
-        } else if (lctx.n_outputs == 1) {
-            // only keep last output
-            data[0] = n_tokens - 1;
-        } else {
-            GGML_ASSERT(lctx.n_outputs == 0);
         }
     }

@@ -17258,8 +18197,9 @@ static enum ggml_status llama_graph_compute(
             int   n_threads,
         ggml_threadpool * threadpool) {
     if (lctx.backend_cpu != nullptr) {
-
-
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu));
+        auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool");
+        set_threadpool_fn(lctx.backend_cpu, threadpool);
     }

     // set the number of threads for all the backends
@@ -17460,6 +18400,7 @@ static int llama_decode_internal(
         embd = nullptr; // do not extract embeddings when not needed
         GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
     }
+
     // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

     ggml_backend_sched_alloc_graph(lctx.sched.get(), gf);
@@ -18026,13 +18967,13 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
 static void llama_kv_cache_update_internal(struct llama_context & lctx) {
     bool need_reserve = false;

-
-
-
-            GGML_ABORT("Deepseek2 does not support K-shift");
+    if (lctx.kv_self.has_shift) {
+        if (!llama_kv_cache_can_shift(&lctx)) {
+            GGML_ABORT("The current context does not support K-shift");
         }

-
+        // apply K-shift if needed
+        if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
             ggml_backend_sched_reset(lctx.sched.get());

             ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
@@ -18247,10 +19188,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
         new_type = GGML_TYPE_IQ3_S;
     }
-    else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 ||
-             new_type == GGML_TYPE_Q4_0_8_8) {
-        new_type = GGML_TYPE_Q4_0;
-    }
     else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) {
         new_type = GGML_TYPE_Q4_K;
     }
@@ -18573,9 +19510,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break;
         case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_4: default_type = GGML_TYPE_Q4_0_4_4; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_4_8: default_type = GGML_TYPE_Q4_0_4_8; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_0_8_8: default_type = GGML_TYPE_Q4_0_8_8; break;

         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -18914,14 +19848,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             f32_data = (float *) f32_conv_buf.data();
         }

-        int chunk_size_multiplier = 1;
-        if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8 || new_type == GGML_TYPE_Q4_0_8_8) {
-            if ((new_type == GGML_TYPE_Q4_0_8_8) && (tensor->ne[1] % 8 != 0)) new_type = GGML_TYPE_Q4_0;
-            else if (tensor->ne[1] % 4 != 0) new_type = GGML_TYPE_Q4_0;
-            if (new_type == GGML_TYPE_Q4_0_8_8) chunk_size_multiplier = 8;
-            else if (new_type == GGML_TYPE_Q4_0_4_4 || new_type == GGML_TYPE_Q4_0_4_8) chunk_size_multiplier = 4;
-        }
-
         LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
         fflush(stdout);

@@ -18934,8 +19860,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         const int64_t nrows = tensor->ne[1];

         static const int64_t min_chunk_size = 32 * 512;
-        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row))
-            chunk_size_multiplier;
+        const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));

         const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
         const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
@@ -19176,6 +20101,7 @@ void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
 //
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
+        /*.devices =*/ nullptr,
         /*.n_gpu_layers =*/ 0,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
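Note: llama_model_params gains a devices field in this release (null-terminated list, iterated by the loader in the llama_load_model_from_file hunk further down). A minimal caller-side sketch, not part of the diff, of filling that list with the GPU devices reported by the backend registry; only functions that appear elsewhere in this diff are used:

    // illustrative sketch: restrict a model to the registry's GPU devices
    #include "llama.h"
    #include "ggml-backend.h"
    #include <vector>

    static llama_model * load_on_gpus(const char * path) {
        std::vector<ggml_backend_dev_t> devs;
        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
                devs.push_back(dev);
            }
        }
        devs.push_back(nullptr); // the loader walks the array until it hits a null entry

        llama_model_params mparams = llama_model_default_params();
        mparams.devices = devs.data();
        return llama_load_model_from_file(path, mparams);
    }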
@@ -19293,7 +20219,11 @@ void llama_backend_init(void) {

 void llama_numa_init(enum ggml_numa_strategy numa) {
     if (numa != GGML_NUMA_STRATEGY_DISABLED) {
-
+        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        GGML_ASSERT(dev && "CPU backend is not loaded");
+        auto * reg = ggml_backend_dev_backend_reg(dev);
+        auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init");
+        numa_init_fn(numa);
     }
 }

@@ -19384,19 +20314,24 @@ struct llama_model * llama_load_model_from_file(
     }

     // create list of devices to use with this model
-
-
-
-
-
-
-
-
-
+    if (params.devices) {
+        for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {
+            model->devices.push_back(*dev);
+        }
+    } else {
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;

-
-
-
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    model->devices.push_back(dev);
+                    break;
+            }
         }
     }

@@ -19567,9 +20502,6 @@ struct llama_context * llama_new_context_with_model(
             __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

-    ctx->abort_callback = params.abort_callback;
-    ctx->abort_callback_data = params.abort_callback_data;
-
     ctx->logits_all = params.logits_all;

     // build worst-case graph for encoder if a model contains encoder
@@ -19618,7 +20550,7 @@ struct llama_context * llama_new_context_with_model(
     }

     // add CPU backend
-    ctx->backend_cpu =
+    ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
     if (ctx->backend_cpu == nullptr) {
         LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
         llama_free(ctx);
@@ -19638,6 +20570,8 @@ struct llama_context * llama_new_context_with_model(
         }
     }

+    llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data);
+
     if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
@@ -19683,7 +20617,8 @@ struct llama_context * llama_new_context_with_model(
     std::vector<ggml_backend_t> backend_ptrs;
     for (auto & backend : ctx->backends) {
         auto * buft = ggml_backend_get_default_buffer_type(backend.get());
-
+        auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+        if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) {
             // use the host buffer of the first device CPU for faster transfer of the intermediate state
             auto * dev = model->devices[0];
             auto * host_buft = ggml_backend_dev_host_buffer_type(dev);
@@ -19711,7 +20646,8 @@ struct llama_context * llama_new_context_with_model(
     // pipeline parallelism requires support for async compute and events in all devices
     if (pipeline_parallel) {
         for (auto & backend : ctx->backends) {
-
+            auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get()));
+            if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) {
                 // ignore CPU backend
                 continue;
             }
@@ -19853,6 +20789,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_T5ENCODER:
         case LLM_ARCH_JAIS:
         case LLM_ARCH_RWKV6:
+        case LLM_ARCH_WAVTOKENIZER_DEC:
             return LLAMA_ROPE_TYPE_NONE;

         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -19867,6 +20804,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
         case LLM_ARCH_ARCTIC:
+        case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
@@ -19885,6 +20823,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_QWEN:
         case LLM_ARCH_QWEN2:
         case LLM_ARCH_QWEN2MOE:
+        case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
@@ -19899,6 +20838,9 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM3:
             return LLAMA_ROPE_TYPE_NEOX;

+        case LLM_ARCH_QWEN2VL:
+            return LLAMA_ROPE_TYPE_MROPE;
+
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
             GGML_ABORT("unknown architecture");
@@ -19965,17 +20907,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return model->n_elements;
 }

-struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
-            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
-                return it.first == name;
-            });
-    if (it == model->tensors_by_name.end()) {
-        return nullptr;
-    }
-    return it->second;
-}
-
 bool llama_model_has_encoder(const struct llama_model * model) {
     switch (model->arch) {
         case LLM_ARCH_T5: return true;
@@ -20276,6 +21207,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
     llama_kv_cache_update_internal(*ctx);
 }

+bool llama_kv_cache_can_shift(struct llama_context * ctx) {
+    return !ctx->kv_self.recurrent && ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
+}
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
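Note: llama_kv_cache_can_shift is a new public helper. A hedged caller-side sketch of guarding a position shift behind it; llama_kv_cache_seq_add is the pre-existing shift API and is assumed unchanged here:

    // illustrative sketch: only shift KV-cache positions when the context supports it
    static void shift_cache(llama_context * ctx, llama_seq_id seq, llama_pos p0, llama_pos p1, llama_pos delta) {
        if (!llama_kv_cache_can_shift(ctx)) {
            // e.g. recurrent models, or DeepSeek2 with MLA: fall back to re-evaluating the prompt
            return;
        }
        llama_kv_cache_seq_add(ctx, seq, p0, p1, delta); // existing API, assumed unchanged
    }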
@@ -21260,6 +22195,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) {
 void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) {
     ctx->abort_callback = abort_callback;
     ctx->abort_callback_data = abort_callback_data;
+
+    for (auto & backend : ctx->backends) {
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+        auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+        if (set_abort_callback_fn) {
+            set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data);
+        }
+    }
 }

 void llama_set_embeddings(struct llama_context * ctx, bool embeddings) {
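Note: llama_set_abort_callback now forwards the callback to every backend whose registry exposes ggml_backend_set_abort_callback. An illustrative caller-side sketch using only the signature visible in this hunk, with a wall-clock deadline carried through the user-data pointer:

    // illustrative sketch: abort graph compute once a deadline passes
    #include <chrono>

    struct deadline { std::chrono::steady_clock::time_point t; };

    static bool abort_after_deadline(void * data) {
        auto * d = (deadline *) data;
        return std::chrono::steady_clock::now() > d->t; // returning true aborts the current compute
    }

    // usage, with ctx an existing llama_context *:
    //   deadline d { std::chrono::steady_clock::now() + std::chrono::seconds(30) };
    //   llama_set_abort_callback(ctx, abort_after_deadline, &d);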
@@ -21455,7 +22398,7 @@ float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
             throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs));
         }
     } else if ((size_t) i >= ctx->output_ids.size()) {
-        throw std::runtime_error(format("out of range [0, %
+        throw std::runtime_error(format("out of range [0, %zu)", ctx->output_ids.size()));
     } else {
         j = ctx->output_ids[i];
     }
@@ -21626,18 +22569,111 @@ int32_t llama_detokenize(
|
|
|
21626
22569
|
// chat templates
|
|
21627
22570
|
//
|
|
21628
22571
|
|
|
22572
|
+
static llm_chat_template llama_chat_detect_template(const std::string & tmpl) {
|
|
22573
|
+
if (LLM_CHAT_TEMPLATES.find(tmpl) != LLM_CHAT_TEMPLATES.end()) {
|
|
22574
|
+
return LLM_CHAT_TEMPLATES.at(tmpl);
|
|
22575
|
+
}
|
|
22576
|
+
auto tmpl_contains = [&tmpl](const char * haystack) -> bool {
|
|
22577
|
+
return tmpl.find(haystack) != std::string::npos;
|
|
22578
|
+
};
|
|
22579
|
+
if (tmpl_contains("<|im_start|>")) {
|
|
22580
|
+
return LLM_CHAT_TEMPLATE_CHATML;
|
|
22581
|
+
} else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
|
|
22582
|
+
if (tmpl_contains("[SYSTEM_PROMPT]")) {
|
|
22583
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V7;
|
|
22584
|
+
} else if (
|
|
22585
|
+
// catches official 'v1' template
|
|
22586
|
+
tmpl_contains("' [INST] ' + system_message")
|
|
22587
|
+
// catches official 'v3' and 'v3-tekken' templates
|
|
22588
|
+
|| tmpl_contains("[AVAILABLE_TOOLS]")
|
|
22589
|
+
) {
|
|
22590
|
+
// Official mistral 'v1', 'v3' and 'v3-tekken' templates
|
|
22591
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
|
|
22592
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
|
|
22593
|
+
if (tmpl_contains(" [INST]")) {
|
|
22594
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V1;
|
|
22595
|
+
} else if (tmpl_contains("\"[INST]\"")) {
|
|
22596
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN;
|
|
22597
|
+
}
|
|
22598
|
+
return LLM_CHAT_TEMPLATE_MISTRAL_V3;
|
|
22599
|
+
} else {
|
|
22600
|
+
// llama2 template and its variants
|
|
22601
|
+
// [variant] support system message
|
|
22602
|
+
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
|
|
22603
|
+
bool support_system_message = tmpl_contains("<<SYS>>");
|
|
22604
|
+
bool add_bos_inside_history = tmpl_contains("bos_token + '[INST]");
|
|
22605
|
+
bool strip_message = tmpl_contains("content.strip()");
|
|
22606
|
+
if (strip_message) {
|
|
22607
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
|
|
22608
|
+
} else if (add_bos_inside_history) {
|
|
22609
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
|
|
22610
|
+
} else if (support_system_message) {
|
|
22611
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2_SYS;
|
|
22612
|
+
} else {
|
|
22613
|
+
return LLM_CHAT_TEMPLATE_LLAMA_2;
|
|
22614
|
+
}
|
|
22615
|
+
}
|
|
22616
|
+
} else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
|
|
22617
|
+
return LLM_CHAT_TEMPLATE_PHI_3;
|
|
22618
|
+
} else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
|
|
22619
|
+
return LLM_CHAT_TEMPLATE_ZEPHYR;
|
|
22620
|
+
} else if (tmpl_contains("bos_token + message['role']")) {
|
|
22621
|
+
return LLM_CHAT_TEMPLATE_MONARCH;
|
|
22622
|
+
} else if (tmpl_contains("<start_of_turn>")) {
|
|
22623
|
+
return LLM_CHAT_TEMPLATE_GEMMA;
|
|
22624
|
+
} else if (tmpl_contains("'\\n\\nAssistant: ' + eos_token")) {
|
|
22625
|
+
// OrionStarAI/Orion-14B-Chat
|
|
22626
|
+
return LLM_CHAT_TEMPLATE_ORION;
|
|
22627
|
+
} else if (tmpl_contains("GPT4 Correct ")) {
|
|
22628
|
+
// openchat/openchat-3.5-0106
|
|
22629
|
+
return LLM_CHAT_TEMPLATE_OPENCHAT;
|
|
22630
|
+
} else if (tmpl_contains("USER: ") && tmpl_contains("ASSISTANT: ")) {
|
|
22631
|
+
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
|
22632
|
+
if (tmpl_contains("SYSTEM: ")) {
|
|
22633
|
+
return LLM_CHAT_TEMPLATE_VICUNA_ORCA;
|
|
22634
|
+
}
|
|
22635
|
+
return LLM_CHAT_TEMPLATE_VICUNA;
|
|
22636
|
+
} else if (tmpl_contains("### Instruction:") && tmpl_contains("<|EOT|>")) {
|
|
22637
|
+
// deepseek-ai/deepseek-coder-33b-instruct
|
|
22638
|
+
return LLM_CHAT_TEMPLATE_DEEPSEEK;
|
|
22639
|
+
} else if (tmpl_contains("<|START_OF_TURN_TOKEN|>") && tmpl_contains("<|USER_TOKEN|>")) {
|
|
22640
|
+
// CohereForAI/c4ai-command-r-plus
|
|
22641
|
+
return LLM_CHAT_TEMPLATE_COMMAND_R;
|
|
22642
|
+
} else if (tmpl_contains("<|start_header_id|>") && tmpl_contains("<|end_header_id|>")) {
|
|
22643
|
+
return LLM_CHAT_TEMPLATE_LLAMA_3;
|
|
22644
|
+
} else if (tmpl_contains("[gMASK]sop")) {
|
|
22645
|
+
// chatglm3-6b
|
|
22646
|
+
return LLM_CHAT_TEMPLATE_CHATGML_3;
|
|
22647
|
+
} else if (tmpl_contains("[gMASK]<sop>")) {
|
|
22648
|
+
return LLM_CHAT_TEMPLATE_CHATGML_4;
|
|
22649
|
+
} else if (tmpl_contains(LU8("<用户>"))) {
|
|
22650
|
+
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
|
22651
|
+
return LLM_CHAT_TEMPLATE_MINICPM;
|
|
22652
|
+
} else if (tmpl_contains("'Assistant: ' + message['content'] + eos_token")) {
|
|
22653
|
+
return LLM_CHAT_TEMPLATE_DEEPSEEK_2;
|
|
22654
|
+
} else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
|
|
22655
|
+
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
|
22656
|
+
// EXAONE-3.0-7.8B-Instruct
|
|
22657
|
+
return LLM_CHAT_TEMPLATE_EXAONE_3;
|
|
22658
|
+
} else if (tmpl_contains("rwkv-world")) {
|
|
22659
|
+
return LLM_CHAT_TEMPLATE_RWKV_WORLD;
|
|
22660
|
+
} else if (tmpl_contains("<|start_of_role|>")) {
|
|
22661
|
+
return LLM_CHAT_TEMPLATE_GRANITE;
|
|
22662
|
+
} else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) {
|
|
22663
|
+
return LLM_CHAT_TEMPLATE_GIGACHAT;
|
|
22664
|
+
}
|
|
22665
|
+
return LLM_CHAT_TEMPLATE_UNKNOWN;
|
|
22666
|
+
}
|
|
22667
|
+
|
|
21629
22668
|
// Simple version of "llama_apply_chat_template" that only works with strings
|
|
21630
22669
|
// This function uses heuristic checks to determine commonly used template. It is not a jinja parser.
|
|
21631
22670
|
static int32_t llama_chat_apply_template_internal(
|
|
21632
|
-
const
|
|
22671
|
+
const llm_chat_template tmpl,
|
|
21633
22672
|
const std::vector<const llama_chat_message *> & chat,
|
|
21634
22673
|
std::string & dest, bool add_ass) {
|
|
21635
22674
|
// Taken from the research: https://github.com/ggerganov/llama.cpp/issues/5527
|
|
21636
22675
|
std::stringstream ss;
|
|
21637
|
-
|
|
21638
|
-
return tmpl.find(haystack) != std::string::npos;
|
|
21639
|
-
};
|
|
21640
|
-
if (tmpl == "chatml" || tmpl_contains("<|im_start|>")) {
|
|
22676
|
+
if (tmpl == LLM_CHAT_TEMPLATE_CHATML) {
|
|
21641
22677
|
// chatml template
|
|
21642
22678
|
for (auto message : chat) {
|
|
21643
22679
|
ss << "<|im_start|>" << message->role << "\n" << message->content << "<|im_end|>\n";
|
|
@@ -21645,16 +22681,59 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21645
22681
|
if (add_ass) {
|
|
21646
22682
|
ss << "<|im_start|>assistant\n";
|
|
21647
22683
|
}
|
|
21648
|
-
} else if (tmpl ==
|
|
22684
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
|
|
22685
|
+
// Official mistral 'v7' template
|
|
22686
|
+
// See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
|
|
22687
|
+
for (auto message : chat) {
|
|
22688
|
+
std::string role(message->role);
|
|
22689
|
+
std::string content(message->content);
|
|
22690
|
+
if (role == "system") {
|
|
22691
|
+
ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
|
|
22692
|
+
} else if (role == "user") {
|
|
22693
|
+
ss << "[INST] " << content << "[/INST]";
|
|
22694
|
+
}
|
|
22695
|
+
else {
|
|
22696
|
+
ss << " " << content << "</s>";
|
|
22697
|
+
}
|
|
22698
|
+
}
|
|
22699
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
|
|
22700
|
+
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3
|
|
22701
|
+
|| tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN) {
|
|
22702
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/chat_templates.md
|
|
22703
|
+
// See: https://github.com/mistralai/cookbook/blob/main/concept-deep-dive/tokenization/templates.md
|
|
22704
|
+
std::string leading_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1 ? " " : "";
|
|
22705
|
+
std::string trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN ? "" : " ";
|
|
22706
|
+
bool trim_assistant_message = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V3;
|
|
22707
|
+
bool is_inside_turn = false;
|
|
22708
|
+
for (auto message : chat) {
|
|
22709
|
+
if (!is_inside_turn) {
|
|
22710
|
+
ss << leading_space << "[INST]" << trailing_space;
|
|
22711
|
+
is_inside_turn = true;
|
|
22712
|
+
}
|
|
22713
|
+
std::string role(message->role);
|
|
22714
|
+
std::string content(message->content);
|
|
22715
|
+
if (role == "system") {
|
|
22716
|
+
ss << content << "\n\n";
|
|
22717
|
+
} else if (role == "user") {
|
|
22718
|
+
ss << content << leading_space << "[/INST]";
|
|
22719
|
+
} else {
|
|
22720
|
+
ss << trailing_space << (trim_assistant_message ? trim(content) : content) << "</s>";
|
|
22721
|
+
is_inside_turn = false;
|
|
22722
|
+
}
|
|
22723
|
+
}
|
|
22724
|
+
} else if (
|
|
22725
|
+
tmpl == LLM_CHAT_TEMPLATE_LLAMA_2
|
|
22726
|
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS
|
|
22727
|
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS
|
|
22728
|
+
|| tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP) {
|
|
21649
22729
|
// llama2 template and its variants
|
|
21650
22730
|
// [variant] support system message
|
|
21651
|
-
|
|
21652
|
-
|
|
21653
|
-
bool space_around_response = tmpl_contains("' ' + eos_token");
|
|
22731
|
+
// See: https://huggingface.co/blog/llama2#how-to-prompt-llama-2
|
|
22732
|
+
bool support_system_message = tmpl != LLM_CHAT_TEMPLATE_LLAMA_2;
|
|
21654
22733
|
// [variant] add BOS inside history
|
|
21655
|
-
bool add_bos_inside_history =
|
|
22734
|
+
bool add_bos_inside_history = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_BOS;
|
|
21656
22735
|
// [variant] trim spaces from the input message
|
|
21657
|
-
bool strip_message =
|
|
22736
|
+
bool strip_message = tmpl == LLM_CHAT_TEMPLATE_LLAMA_2_SYS_STRIP;
|
|
21658
22737
|
// construct the prompt
|
|
21659
22738
|
bool is_inside_turn = true; // skip BOS at the beginning
|
|
21660
22739
|
ss << "[INST] ";
|
|
@@ -21675,12 +22754,11 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21675
22754
|
} else if (role == "user") {
|
|
21676
22755
|
ss << content << " [/INST]";
|
|
21677
22756
|
} else {
|
|
21678
|
-
ss <<
|
|
22757
|
+
ss << content << "</s>";
|
|
21679
22758
|
is_inside_turn = false;
|
|
21680
22759
|
}
|
|
21681
22760
|
}
|
|
21682
|
-
|
|
21683
|
-
} else if (tmpl == "phi3" || (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>"))) {
|
|
22761
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_PHI_3) {
|
|
21684
22762
|
// Phi 3
|
|
21685
22763
|
for (auto message : chat) {
|
|
21686
22764
|
std::string role(message->role);
|
|
@@ -21689,7 +22767,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21689
22767
|
if (add_ass) {
|
|
21690
22768
|
ss << "<|assistant|>\n";
|
|
21691
22769
|
}
|
|
21692
|
-
} else if (tmpl ==
|
|
22770
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) {
|
|
21693
22771
|
// zephyr template
|
|
21694
22772
|
for (auto message : chat) {
|
|
21695
22773
|
ss << "<|" << message->role << "|>" << "\n" << message->content << "<|endoftext|>\n";
|
|
@@ -21697,7 +22775,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21697
22775
|
if (add_ass) {
|
|
21698
22776
|
ss << "<|assistant|>\n";
|
|
21699
22777
|
}
|
|
21700
|
-
} else if (tmpl ==
|
|
22778
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MONARCH) {
|
|
21701
22779
|
// mlabonne/AlphaMonarch-7B template (the <s> is included inside history)
|
|
21702
22780
|
for (auto message : chat) {
|
|
21703
22781
|
std::string bos = (message == chat.front()) ? "" : "<s>"; // skip BOS for first message
|
|
@@ -21706,7 +22784,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21706
22784
|
if (add_ass) {
|
|
21707
22785
|
ss << "<s>assistant\n";
|
|
21708
22786
|
}
|
|
21709
|
-
} else if (tmpl ==
|
|
22787
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_GEMMA) {
|
|
21710
22788
|
// google/gemma-7b-it
|
|
21711
22789
|
std::string system_prompt = "";
|
|
21712
22790
|
for (auto message : chat) {
|
|
@@ -21728,7 +22806,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21728
22806
|
if (add_ass) {
|
|
21729
22807
|
ss << "<start_of_turn>model\n";
|
|
21730
22808
|
}
|
|
21731
|
-
} else if (tmpl ==
|
|
22809
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_ORION) {
|
|
21732
22810
|
// OrionStarAI/Orion-14B-Chat
|
|
21733
22811
|
std::string system_prompt = "";
|
|
21734
22812
|
for (auto message : chat) {
|
|
@@ -21748,7 +22826,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21748
22826
|
ss << message->content << "</s>";
|
|
21749
22827
|
}
|
|
21750
22828
|
}
|
|
21751
|
-
} else if (tmpl ==
|
|
22829
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_OPENCHAT) {
|
|
21752
22830
|
// openchat/openchat-3.5-0106,
|
|
21753
22831
|
for (auto message : chat) {
|
|
21754
22832
|
std::string role(message->role);
|
|
@@ -21762,13 +22840,13 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21762
22840
|
if (add_ass) {
|
|
21763
22841
|
ss << "GPT4 Correct Assistant:";
|
|
21764
22842
|
}
|
|
21765
|
-
} else if (tmpl ==
|
|
22843
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_VICUNA || tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
|
|
21766
22844
|
// eachadea/vicuna-13b-1.1 (and Orca variant)
|
|
21767
22845
|
for (auto message : chat) {
|
|
21768
22846
|
std::string role(message->role);
|
|
21769
22847
|
if (role == "system") {
|
|
21770
22848
|
// Orca-Vicuna variant uses a system prefix
|
|
21771
|
-
if (tmpl ==
|
|
22849
|
+
if (tmpl == LLM_CHAT_TEMPLATE_VICUNA_ORCA) {
|
|
21772
22850
|
ss << "SYSTEM: " << message->content << "\n";
|
|
21773
22851
|
} else {
|
|
21774
22852
|
ss << message->content << "\n\n";
|
|
@@ -21782,7 +22860,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21782
22860
|
if (add_ass) {
|
|
21783
22861
|
ss << "ASSISTANT:";
|
|
21784
22862
|
}
|
|
21785
|
-
} else if (tmpl ==
|
|
22863
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK) {
|
|
21786
22864
|
// deepseek-ai/deepseek-coder-33b-instruct
|
|
21787
22865
|
for (auto message : chat) {
|
|
21788
22866
|
std::string role(message->role);
|
|
@@ -21797,7 +22875,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21797
22875
|
if (add_ass) {
|
|
21798
22876
|
ss << "### Response:\n";
|
|
21799
22877
|
}
|
|
21800
|
-
} else if (tmpl ==
|
|
22878
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_COMMAND_R) {
|
|
21801
22879
|
// CohereForAI/c4ai-command-r-plus
|
|
21802
22880
|
for (auto message : chat) {
|
|
21803
22881
|
std::string role(message->role);
|
|
@@ -21812,7 +22890,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21812
22890
|
if (add_ass) {
|
|
21813
22891
|
ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>";
|
|
21814
22892
|
}
|
|
21815
|
-
} else if (tmpl ==
|
|
22893
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA_3) {
|
|
21816
22894
|
// Llama 3
|
|
21817
22895
|
for (auto message : chat) {
|
|
21818
22896
|
std::string role(message->role);
|
|
@@ -21821,7 +22899,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21821
22899
|
if (add_ass) {
|
|
21822
22900
|
ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
|
|
21823
22901
|
}
|
|
21824
|
-
} else if (tmpl ==
|
|
22902
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
|
|
21825
22903
|
// chatglm3-6b
|
|
21826
22904
|
ss << "[gMASK]" << "sop";
|
|
21827
22905
|
for (auto message : chat) {
|
|
@@ -21831,7 +22909,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21831
22909
|
if (add_ass) {
|
|
21832
22910
|
ss << "<|assistant|>";
|
|
21833
22911
|
}
|
|
21834
|
-
} else if (tmpl ==
|
|
22912
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
|
|
21835
22913
|
ss << "[gMASK]" << "<sop>";
|
|
21836
22914
|
for (auto message : chat) {
|
|
21837
22915
|
std::string role(message->role);
|
|
@@ -21840,7 +22918,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21840
22918
|
if (add_ass) {
|
|
21841
22919
|
ss << "<|assistant|>";
|
|
21842
22920
|
}
|
|
21843
|
-
} else if (tmpl ==
|
|
22921
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
|
|
21844
22922
|
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
|
|
21845
22923
|
for (auto message : chat) {
|
|
21846
22924
|
std::string role(message->role);
|
|
@@ -21852,7 +22930,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21852
22930
|
ss << trim(message->content);
|
|
21853
22931
|
}
|
|
21854
22932
|
}
|
|
21855
|
-
} else if (tmpl ==
|
|
22933
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_DEEPSEEK_2) {
|
|
21856
22934
|
// DeepSeek-V2
|
|
21857
22935
|
for (auto message : chat) {
|
|
21858
22936
|
std::string role(message->role);
|
|
@@ -21867,7 +22945,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21867
22945
|
if (add_ass) {
|
|
21868
22946
|
ss << "Assistant:";
|
|
21869
22947
|
}
|
|
21870
|
-
} else if (tmpl ==
|
|
22948
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_3) {
|
|
21871
22949
|
// ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
|
|
21872
22950
|
// EXAONE-3.0-7.8B-Instruct
|
|
21873
22951
|
for (auto message : chat) {
|
|
@@ -21883,7 +22961,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21883
22961
|
if (add_ass) {
|
|
21884
22962
|
ss << "[|assistant|]";
|
|
21885
22963
|
}
|
|
21886
|
-
} else if (tmpl ==
|
|
22964
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
|
|
21887
22965
|
// this template requires the model to have "\n\n" as EOT token
|
|
21888
22966
|
for (auto message : chat) {
|
|
21889
22967
|
std::string role(message->role);
|
|
@@ -21893,7 +22971,7 @@ static int32_t llama_chat_apply_template_internal(
|
|
|
21893
22971
|
ss << message->content << "\n\n";
|
|
21894
22972
|
}
|
|
21895
22973
|
}
|
|
21896
|
-
} else if (tmpl ==
|
|
22974
|
+
} else if (tmpl == LLM_CHAT_TEMPLATE_GRANITE) {
|
|
21897
22975
|
// IBM Granite template
|
|
21898
22976
|
for (const auto & message : chat) {
|
|
21899
22977
|
std::string role(message->role);
|
|
@@ -21906,6 +22984,32 @@ static int32_t llama_chat_apply_template_internal(
         if (add_ass) {
             ss << "<|start_of_role|>assistant<|end_of_role|>\n";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) {
+        // GigaChat template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        // Handle system message if present
+        if (has_system) {
+            ss << "<s>" << chat[0]->content << "<|message_sep|>";
+        } else {
+            ss << "<s>";
+        }
+
+        // Process remaining messages
+        for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>"
+                   << "available functions<|role_sep|>[]<|message_sep|>";
+            } else if (role == "assistant") {
+                ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << "assistant<|role_sep|>";
+        }
     } else {
         // template not supported
         return -1;
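Note (not part of the diff): the snippet below mirrors the GigaChat branch above for a small hard-coded conversation, so the resulting prompt shape is easy to see. The message texts are invented placeholders; only the special tokens and the control flow come from the hunk.

// Illustrative sketch of what the GigaChat branch produces.
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

int main() {
    std::vector<std::pair<std::string, std::string>> chat = {
        { "system",    "You are a helpful assistant." },
        { "user",      "Hello!" },
        { "assistant", "Hi, how can I help?" },
    };
    const bool add_ass = true; // append the generation prompt

    std::ostringstream ss;
    bool has_system = !chat.empty() && chat[0].first == "system";
    if (has_system) {
        ss << "<s>" << chat[0].second << "<|message_sep|>";
    } else {
        ss << "<s>";
    }
    for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) {
        if (chat[i].first == "user") {
            ss << "user<|role_sep|>" << chat[i].second << "<|message_sep|>"
               << "available functions<|role_sep|>[]<|message_sep|>";
        } else if (chat[i].first == "assistant") {
            ss << "assistant<|role_sep|>" << chat[i].second << "<|message_sep|>";
        }
    }
    if (add_ass) {
        ss << "assistant<|role_sep|>";
    }
    std::cout << ss.str() << "\n";
    // -> <s>You are a helpful assistant.<|message_sep|>user<|role_sep|>Hello!<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi, how can I help?<|message_sep|>assistant<|role_sep|>
}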
@@ -21925,15 +23029,15 @@ int32_t llama_chat_apply_template(
     std::string curr_tmpl(tmpl == nullptr ? "" : tmpl);
     if (tmpl == nullptr) {
         GGML_ASSERT(model != nullptr);
-
-
-
-
-
+
+        // load template from model, if available
+        const auto & it = model->gguf_kv.find("tokenizer.chat_template");
+        if (it != model->gguf_kv.end() && it->second.size() > 0) {
+            curr_tmpl = it->second;
+        }
+        else {
             // worst case: there is no information about template, we will use chatml by default
-            curr_tmpl = "chatml";
-        } else {
-            curr_tmpl = std::string(model_template.data(), model_template.size());
+            curr_tmpl = "chatml"; // see llama_chat_apply_template_internal
         }
     }

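Note (not part of the diff): the rewritten nullptr-template path above reads the tokenizer.chat_template key straight from the model's GGUF metadata. A caller can inspect the same key through the public metadata API; a minimal sketch follows, where the 4 KiB buffer size is an assumption rather than a documented limit.

// Sketch: check which chat template (if any) a loaded model ships with.
#include <cstdio>
#include <vector>
#include "llama.h"

static void print_model_template(const llama_model * model) {
    std::vector<char> buf(4096, 0); // size chosen arbitrarily for illustration
    int32_t n = llama_model_meta_val_str(model, "tokenizer.chat_template", buf.data(), buf.size());
    if (n < 0) {
        printf("no tokenizer.chat_template in this GGUF; chatml will be used\n");
    } else {
        printf("model chat template (%d bytes):\n%s\n", n, buf.data());
    }
}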
@@ -21945,7 +23049,11 @@ int32_t llama_chat_apply_template(
     }

     std::string formatted_chat;
-
+    llm_chat_template detected_tmpl = llama_chat_detect_template(curr_tmpl);
+    if (detected_tmpl == LLM_CHAT_TEMPLATE_UNKNOWN) {
+        return -1;
+    }
+    int32_t res = llama_chat_apply_template_internal(detected_tmpl, chat_vec, formatted_chat, add_ass);
     if (res < 0) {
         return res;
     }
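Note (not part of the diff): with the detection step added above, a template string that llama_chat_detect_template() does not recognize now fails fast with -1 before any formatting happens. A hedged usage sketch of the public entry point; the surrounding setup and buffer size are assumptions, not taken from this diff.

// Sketch: format a two-message chat with the built-in "chatml" template.
#include <algorithm>
#include <cstdio>
#include <vector>
#include "llama.h"

static void format_example(void) {
    std::vector<llama_chat_message> msgs = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };
    std::vector<char> buf(4096);
    int32_t n = llama_chat_apply_template(
        /*model  =*/ nullptr,   // only consulted when tmpl is nullptr
        /*tmpl   =*/ "chatml",
        msgs.data(), msgs.size(),
        /*add_ass=*/ true,
        buf.data(), (int32_t) buf.size());
    if (n < 0) {
        printf("template rejected (unknown or unsupported)\n");
    } else {
        // n can exceed the buffer size, in which case the output was truncated
        printf("%.*s\n", (int) std::min<int32_t>(n, (int32_t) buf.size()), buf.data());
    }
}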
@@ -21955,6 +23063,15 @@ int32_t llama_chat_apply_template(
     return res;
 }

+int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
+    auto it = LLM_CHAT_TEMPLATES.begin();
+    for (size_t i = 0; i < std::min(len, LLM_CHAT_TEMPLATES.size()); i++) {
+        output[i] = it->first.c_str();
+        std::advance(it, 1);
+    }
+    return (int32_t) LLM_CHAT_TEMPLATES.size();
+}
+
 //
 // sampling
 //
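Note (not part of the diff): the new llama_chat_builtin_templates() copies at most len names and always returns the full count, so it can be called twice, once to size the array and once to fill it. A small sketch; the template names in the comment are illustrative.

// Sketch: list every chat template name known to this build.
// Passing (nullptr, 0) is safe: the copy loop runs zero times and only the count is returned.
#include <cstdio>
#include <vector>
#include "llama.h"

static void list_builtin_templates(void) {
    int32_t count = llama_chat_builtin_templates(nullptr, 0);
    std::vector<const char *> names(count);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        printf("%s\n", name); // e.g. "chatml", "llama3", "gigachat", ...
    }
}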
@@ -22001,32 +23118,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int
 }

 const char * llama_print_system_info(void) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
     static std::string s;

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
-    s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
-    s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | ";
+    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+        auto * reg = ggml_backend_reg_get(i);
+        auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+        if (get_features_fn) {
+            ggml_backend_feature * features = get_features_fn(reg);
+            s += ggml_backend_reg_name(reg);
+            s += " : ";
+            for (; features->name; features++) {
+                s += features->name;
+                s += " = ";
+                s += features->value;
+                s += " | ";
+            }
+        }
+    }

     return s.c_str();
 }