@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
package/src/llama.cpp/src/llama-batch.cpp

@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
     return ubatch;
 }
 
-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
     GGML_ASSERT(batch.n_tokens >= 0);
     this->batch = &batch;
     this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
     for (size_t i = 0; i < n_tokens; ++i) {
         ids[i] = i;
     }
+
     if (simple_split) {
         seq.resize(1);
         llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         s.length = n_tokens;
         return;
     }
+
     std::sort(ids.begin(), ids.end(),
         [&batch](size_t a, size_t b) {
             int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
             return n_seq_a > n_seq_b;
         }
     );
+
     // init seq
     llama_sbatch_seq * last_seq = nullptr;
 
@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
         seq.push_back(new_seq);
         last_seq = &seq.back();
     }
+
     // keep shared prompts first at the end, then sort by length descending.
     std::sort(seq.begin(), seq.end(),
         [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
package/src/llama.cpp/src/llama-batch.h

@@ -70,7 +70,8 @@ struct llama_sbatch {
     // sequence-wise split
     llama_ubatch split_seq(size_t n_ubatch);
 
-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };
 
 // temporary allocate memory for the input batch if needed
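Note: the two llama-batch hunks above replace the two-step llama_sbatch::from_batch() initializer with a constructor (plus a defaulted no-argument constructor). A minimal call-site sketch of the migration, assuming a populated llama_batch named batch and an embedding size n_embd (illustrative names, not taken from this package):

    // before (0.3.16 vendored llama.cpp): two-step initialization
    //   llama_sbatch sbatch;
    //   sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);

    // after (0.4.0): from_batch is gone; construct directly
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);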
package/src/llama.cpp/src/llama-chat.cpp

@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "mistral-v3",            LLM_CHAT_TEMPLATE_MISTRAL_V3            },
     { "mistral-v3-tekken",     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN     },
     { "mistral-v7",            LLM_CHAT_TEMPLATE_MISTRAL_V7            },
+    { "mistral-v7-tekken",     LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN     },
     { "phi3",                  LLM_CHAT_TEMPLATE_PHI_3                 },
     { "phi4",                  LLM_CHAT_TEMPLATE_PHI_4                 },
     { "falcon3",               LLM_CHAT_TEMPLATE_FALCON_3              },
@@ -50,8 +51,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",             LLM_CHAT_TEMPLATE_DEEPSEEK_3            },
     { "command-r",             LLM_CHAT_TEMPLATE_COMMAND_R             },
     { "llama3",                LLM_CHAT_TEMPLATE_LLAMA_3               },
-    { "chatglm3",              LLM_CHAT_TEMPLATE_CHATGML_3             },
-    { "chatglm4",              LLM_CHAT_TEMPLATE_CHATGML_4             },
+    { "chatglm3",              LLM_CHAT_TEMPLATE_CHATGLM_3             },
+    { "chatglm4",              LLM_CHAT_TEMPLATE_CHATGLM_4             },
     { "glmedge",               LLM_CHAT_TEMPLATE_GLMEDGE               },
     { "minicpm",               LLM_CHAT_TEMPLATE_MINICPM               },
     { "exaone3",               LLM_CHAT_TEMPLATE_EXAONE_3              },
@@ -59,6 +60,10 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "granite",               LLM_CHAT_TEMPLATE_GRANITE               },
     { "gigachat",              LLM_CHAT_TEMPLATE_GIGACHAT              },
     { "megrez",                LLM_CHAT_TEMPLATE_MEGREZ                },
+    { "yandex",                LLM_CHAT_TEMPLATE_YANDEX                },
+    { "bailing",               LLM_CHAT_TEMPLATE_BAILING               },
+    { "llama4",                LLM_CHAT_TEMPLATE_LLAMA4                },
+    { "smolvlm",               LLM_CHAT_TEMPLATE_SMOLVLM               },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -78,7 +83,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
     if (tmpl_contains("<|im_start|>")) {
         return tmpl_contains("<|im_sep|>")
             ? LLM_CHAT_TEMPLATE_PHI_4
-            : LLM_CHAT_TEMPLATE_CHATML;
+            : tmpl_contains("<end_of_utterance>")
+                ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml
+                : LLM_CHAT_TEMPLATE_CHATML;
     } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) {
         if (tmpl_contains("[SYSTEM_PROMPT]")) {
             return LLM_CHAT_TEMPLATE_MISTRAL_V7;
@@ -116,8 +123,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
+    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
+        return LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) {
         return LLM_CHAT_TEMPLATE_ZEPHYR;
     } else if (tmpl_contains("bos_token + message['role']")) {
@@ -146,9 +157,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -168,6 +177,12 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_GIGACHAT;
     } else if (tmpl_contains("<|role_start|>")) {
         return LLM_CHAT_TEMPLATE_MEGREZ;
+    } else if (tmpl_contains(" Ассистент:")) {
+        return LLM_CHAT_TEMPLATE_YANDEX;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
+        return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
+        return LLM_CHAT_TEMPLATE_LLAMA4;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
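Note: the detection hunks above extend and reorder llm_chat_detect_template(). The [gMASK]<sop> check now runs before the generic <|assistant|>/<|user|> branch, and a template that combines <|im_start|> with <end_of_utterance> resolves to SmolVLM instead of ChatML. A sketch of the new behavior, with the Jinja template sources abbreviated down to the markers that drive detection:

    // llm_chat_detect_template is internal to llama.cpp (declared in llama-chat.h)
    llm_chat_detect_template("... <|im_start|> ... <end_of_utterance> ..."); // LLM_CHAT_TEMPLATE_SMOLVLM
    llm_chat_detect_template("... <|im_start|> ...");                        // LLM_CHAT_TEMPLATE_CHATML
    llm_chat_detect_template("... [gMASK]<sop> ...");                        // LLM_CHAT_TEMPLATE_CHATGLM_4
    llm_chat_detect_template("... <|header_start|> ... <|header_end|> ..."); // LLM_CHAT_TEMPLATE_LLAMA4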
@@ -188,19 +203,20 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_start|>assistant\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
         // Official mistral 'v7' template
         // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
         for (auto message : chat) {
             std::string role(message->role);
             std::string content(message->content);
             if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
             } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
             }
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
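Note: the only rendering difference between the two Mistral v7 variants is the space after each control tag (trailing_space above). For a single user message "Hi", derived directly from this hunk:

    mistral-v7:        [INST] Hi[/INST]
    mistral-v7-tekken: [INST]Hi[/INST]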
@@ -423,7 +439,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -433,14 +449,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
             ss << "<|" << role << "|>" << "\n" << message->content;
         }
         if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
         }
     } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         for (auto message : chat) {
@@ -567,6 +583,66 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|role_start|>assistant<|role_end|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_YANDEX) {
+        // Yandex template ("\n\n" is defined as EOT token)
+
+        ss << "<s>";
+
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (role == "user") {
+                ss << " Пользователь: " << chat[i]->content << "\n\n";
+            } else if (role == "assistant") {
+                ss << " Ассистент: " << chat[i]->content << "\n\n";
+            }
+        }
+
+        // Add generation prompt if needed
+        if (add_ass) {
+            ss << " Ассистент:[SEP]";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
+        // Bailing (Ling) template
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content;
+        }
+
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_LLAMA4) {
+        // Llama 4
+        for (auto message : chat) {
+            std::string role(message->role);
+            ss << "<|header_start|>" << role << "<|header_end|>\n\n" << trim(message->content) << "<|eot|>";
+        }
+        if (add_ass) {
+            ss << "<|header_start|>assistant<|header_end|>\n\n";
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) {
+        // SmolVLM
+        ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << message->content << "\n\n";
+            } else if (role == "user") {
+                ss << "User: " << message->content << "<end_of_utterance>\n";
+            } else {
+                ss << "Assistant: " << message->content << "<end_of_utterance>\n";
+            }
+        }
+        if (add_ass) {
+            ss << "Assistant:";
+        }
     } else {
         // template not supported
         return -1;
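Note: for reference, the Llama 4 branch added above renders a one-turn conversation (user message "Hi", add_ass enabled) as the following string, with newlines written out as \n:

    <|header_start|>user<|header_end|>\n\nHi<|eot|><|header_start|>assistant<|header_end|>\n\n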
@@ -585,4 +661,3 @@ int32_t llama_chat_builtin_templates(const char ** output, size_t len) {
     }
     return (int32_t) LLM_CHAT_TEMPLATES.size();
 }
-
package/src/llama.cpp/src/llama-chat.h

@@ -14,6 +14,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MISTRAL_V3,
     LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
     LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
     LLM_CHAT_TEMPLATE_PHI_3,
     LLM_CHAT_TEMPLATE_PHI_4,
     LLM_CHAT_TEMPLATE_FALCON_3,
@@ -29,8 +30,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,
@@ -38,6 +39,10 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_GRANITE,
     LLM_CHAT_TEMPLATE_GIGACHAT,
     LLM_CHAT_TEMPLATE_MEGREZ,
+    LLM_CHAT_TEMPLATE_YANDEX,
+    LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_LLAMA4,
+    LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
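Note: llm_chat_template and the LLM_CHAT_TEMPLATES map are internal, so downstream code reaches the new built-ins ("llama4", "smolvlm", "yandex", "bailing", "mistral-v7-tekken") through llama.cpp's public llama_chat_apply_template(). A minimal sketch, assuming the llama.h signature shipped in this release (template name or Jinja source as the first argument, no model parameter); the message text is made up:

    #include <string>
    #include <vector>
    #include "llama.h"

    int main() {
        llama_chat_message msgs[] = { { "user", "Hi" } };
        std::vector<char> buf(512);
        // "llama4" is one of the built-in names registered in LLM_CHAT_TEMPLATES above
        int32_t n = llama_chat_apply_template("llama4", msgs, 1,
                                              /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        if (n < 0) {
            return 1; // -1: template not supported by llm_chat_apply_template
        }
        if ((size_t) n > buf.size()) {
            // n is the required size; grow the buffer and render again
            buf.resize(n);
            n = llama_chat_apply_template("llama4", msgs, 1,
                                          /*add_ass=*/true, buf.data(), (int32_t) buf.size());
        }
        std::string prompt(buf.data(), (size_t) n);
        return 0;
    }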