@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -19,6 +19,7 @@ struct llama_cparams;
|
|
|
19
19
|
|
|
20
20
|
class llama_memory_i;
|
|
21
21
|
class llama_kv_cache_unified;
|
|
22
|
+
class llama_kv_cache_recurrent;
|
|
22
23
|
|
|
23
24
|
// certain models (typically multi-modal) can produce different types of graphs
|
|
24
25
|
enum llm_graph_type {
|
|
@@ -90,14 +91,29 @@ public:
|
|
|
90
91
|
|
|
91
92
|
class llm_graph_input_pos : public llm_graph_input_i {
|
|
92
93
|
public:
|
|
93
|
-
llm_graph_input_pos(int64_t
|
|
94
|
+
llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
|
|
94
95
|
virtual ~llm_graph_input_pos() = default;
|
|
95
96
|
|
|
96
97
|
void set_input(const llama_ubatch * ubatch) override;
|
|
97
98
|
|
|
98
99
|
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
|
99
100
|
|
|
100
|
-
const int64_t
|
|
101
|
+
const int64_t n_pos_per_embd = 1;
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
// temperature tuning, used by llama4
|
|
105
|
+
class llm_graph_input_attn_temp : public llm_graph_input_i {
|
|
106
|
+
public:
|
|
107
|
+
llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
|
|
108
|
+
: n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
|
|
109
|
+
virtual ~llm_graph_input_attn_temp() = default;
|
|
110
|
+
|
|
111
|
+
void set_input(const llama_ubatch * ubatch) override;
|
|
112
|
+
|
|
113
|
+
ggml_tensor * attn_scale = nullptr; // F32 [n_batch]
|
|
114
|
+
|
|
115
|
+
const uint32_t n_attn_temp_floor_scale;
|
|
116
|
+
const float f_attn_temp_scale;
|
|
101
117
|
};
|
|
102
118
|
|
|
103
119
|
class llm_graph_input_pos_bucket : public llm_graph_input_i {
|
|
@@ -171,26 +187,26 @@ public:
|
|
|
171
187
|
|
|
172
188
|
class llm_graph_input_s_copy : public llm_graph_input_i {
|
|
173
189
|
public:
|
|
174
|
-
llm_graph_input_s_copy(const
|
|
190
|
+
llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
|
175
191
|
virtual ~llm_graph_input_s_copy() = default;
|
|
176
192
|
|
|
177
193
|
void set_input(const llama_ubatch * ubatch) override;
|
|
178
194
|
|
|
179
195
|
ggml_tensor * s_copy; // I32 [kv_size]
|
|
180
196
|
|
|
181
|
-
const
|
|
197
|
+
const llama_kv_cache_recurrent * kv_self;
|
|
182
198
|
};
|
|
183
199
|
|
|
184
200
|
class llm_graph_input_s_mask : public llm_graph_input_i {
|
|
185
201
|
public:
|
|
186
|
-
llm_graph_input_s_mask(const
|
|
202
|
+
llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
|
187
203
|
virtual ~llm_graph_input_s_mask() = default;
|
|
188
204
|
|
|
189
205
|
void set_input(const llama_ubatch * ubatch) override;
|
|
190
206
|
|
|
191
207
|
ggml_tensor * s_mask; // F32 [1, n_kv]
|
|
192
208
|
|
|
193
|
-
const
|
|
209
|
+
const llama_kv_cache_recurrent * kv_self;
|
|
194
210
|
};
|
|
195
211
|
|
|
196
212
|
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
|
@@ -282,6 +298,7 @@ class llm_graph_result_i {
|
|
|
282
298
|
public:
|
|
283
299
|
virtual ~llm_graph_result_i() = default;
|
|
284
300
|
|
|
301
|
+
virtual ggml_tensor * get_tokens() = 0;
|
|
285
302
|
virtual ggml_tensor * get_logits() = 0;
|
|
286
303
|
virtual ggml_tensor * get_embd() = 0;
|
|
287
304
|
virtual ggml_tensor * get_embd_pooled() = 0;
|
|
@@ -296,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
|
|
|
296
313
|
public:
|
|
297
314
|
virtual ~llm_graph_result() = default;
|
|
298
315
|
|
|
316
|
+
ggml_tensor * get_tokens() override { return t_tokens; }
|
|
299
317
|
ggml_tensor * get_logits() override { return t_logits; }
|
|
300
318
|
ggml_tensor * get_embd() override { return t_embd; }
|
|
301
319
|
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
|
|
@@ -312,6 +330,7 @@ public:
|
|
|
312
330
|
}
|
|
313
331
|
|
|
314
332
|
// important graph nodes
|
|
333
|
+
ggml_tensor * t_tokens = nullptr;
|
|
315
334
|
ggml_tensor * t_logits = nullptr;
|
|
316
335
|
ggml_tensor * t_embd = nullptr;
|
|
317
336
|
ggml_tensor * t_embd_pooled = nullptr;
|
|
@@ -335,8 +354,8 @@ struct llm_graph_params {
|
|
|
335
354
|
const llama_cparams & cparams;
|
|
336
355
|
const llama_ubatch & ubatch;
|
|
337
356
|
|
|
338
|
-
|
|
339
|
-
|
|
357
|
+
ggml_backend_sched_t sched;
|
|
358
|
+
ggml_backend_t backend_cpu;
|
|
340
359
|
|
|
341
360
|
const llama_adapter_cvec * cvec;
|
|
342
361
|
const llama_adapter_loras * loras;
|
|
@@ -387,9 +406,9 @@ struct llm_graph_context {
|
|
|
387
406
|
|
|
388
407
|
ggml_context * ctx0 = nullptr;
|
|
389
408
|
|
|
390
|
-
|
|
409
|
+
ggml_backend_sched_t sched;
|
|
391
410
|
|
|
392
|
-
|
|
411
|
+
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
|
393
412
|
|
|
394
413
|
const llama_adapter_cvec * cvec;
|
|
395
414
|
const llama_adapter_loras * loras;
|
|
@@ -402,7 +421,7 @@ struct llm_graph_context {
|
|
|
402
421
|
|
|
403
422
|
llm_graph_context(const llm_graph_params & params);
|
|
404
423
|
|
|
405
|
-
int64_t
|
|
424
|
+
int64_t n_pos_per_embd() const;
|
|
406
425
|
|
|
407
426
|
void cb(ggml_tensor * cur, const char * name, int il) const;
|
|
408
427
|
|
|
@@ -470,6 +489,7 @@ struct llm_graph_context {
|
|
|
470
489
|
|
|
471
490
|
ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
|
|
472
491
|
ggml_tensor * build_inp_pos() const;
|
|
492
|
+
ggml_tensor * build_inp_attn_scale() const;
|
|
473
493
|
ggml_tensor * build_inp_out_ids() const;
|
|
474
494
|
ggml_tensor * build_inp_mean() const;
|
|
475
495
|
ggml_tensor * build_inp_cls() const;
|
|
@@ -487,11 +507,12 @@ struct llm_graph_context {
|
|
|
487
507
|
|
|
488
508
|
ggml_tensor * build_attn_mha(
|
|
489
509
|
ggml_cgraph * gf,
|
|
490
|
-
ggml_tensor * q,
|
|
491
|
-
ggml_tensor * k,
|
|
492
|
-
ggml_tensor * v,
|
|
510
|
+
ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q]
|
|
511
|
+
ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k]
|
|
512
|
+
ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
|
|
493
513
|
ggml_tensor * kq_b,
|
|
494
514
|
ggml_tensor * kq_mask,
|
|
515
|
+
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
495
516
|
bool v_trans,
|
|
496
517
|
float kq_scale) const;
|
|
497
518
|
|
|
@@ -506,6 +527,7 @@ struct llm_graph_context {
|
|
|
506
527
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
507
528
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
508
529
|
ggml_tensor * kq_b,
|
|
530
|
+
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
509
531
|
float kq_scale,
|
|
510
532
|
int il) const;
|
|
511
533
|
|
|
@@ -520,6 +542,7 @@ struct llm_graph_context {
|
|
|
520
542
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
521
543
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
522
544
|
ggml_tensor * kq_b,
|
|
545
|
+
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
523
546
|
float kq_scale,
|
|
524
547
|
int il) const;
|
|
525
548
|
|
|
@@ -534,6 +557,7 @@ struct llm_graph_context {
|
|
|
534
557
|
ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
|
|
535
558
|
ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
|
|
536
559
|
ggml_tensor * kq_b,
|
|
560
|
+
ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
|
|
537
561
|
float kq_scale,
|
|
538
562
|
int il) const;
|
|
539
563
|
|
|
@@ -43,6 +43,10 @@ struct llama_hparams {
|
|
|
43
43
|
uint32_t n_expert_used = 0;
|
|
44
44
|
uint32_t n_rel_attn_bkts = 0;
|
|
45
45
|
|
|
46
|
+
// note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
|
|
47
|
+
uint32_t n_embd_head_k_mla = 0;
|
|
48
|
+
uint32_t n_embd_head_v_mla = 0;
|
|
49
|
+
|
|
46
50
|
// for WavTokenizer
|
|
47
51
|
struct llama_hparams_posnet posnet;
|
|
48
52
|
struct llama_hparams_convnext convnext;
|
|
@@ -62,6 +66,7 @@ struct llama_hparams {
|
|
|
62
66
|
float expert_weights_scale = 0.0;
|
|
63
67
|
bool expert_weights_norm = false;
|
|
64
68
|
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
|
|
69
|
+
uint32_t moe_every_n_layers = 0;
|
|
65
70
|
|
|
66
71
|
float f_norm_eps;
|
|
67
72
|
float f_norm_rms_eps;
|
|
@@ -112,6 +117,14 @@ struct llama_hparams {
|
|
|
112
117
|
bool use_alibi = false;
|
|
113
118
|
bool attn_soft_cap = false;
|
|
114
119
|
|
|
120
|
+
uint32_t n_moe_layer_step = 0;
|
|
121
|
+
bool use_kq_norm = true;
|
|
122
|
+
uint32_t n_attn_chunk = 0;
|
|
123
|
+
// values below seems to be fixed on llama4
|
|
124
|
+
uint32_t n_no_rope_layer_step = 4;
|
|
125
|
+
uint32_t n_attn_temp_floor_scale = 8192;
|
|
126
|
+
float f_attn_temp_scale = 0.1;
|
|
127
|
+
|
|
115
128
|
// needed by encoder-decoder models (e.g. T5, FLAN-T5)
|
|
116
129
|
// ref: https://github.com/ggerganov/llama.cpp/pull/8141
|
|
117
130
|
llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
|