@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#include "regex-partial.h"
|
|
2
|
+
#include "common.h"
|
|
3
|
+
#include <functional>
|
|
4
|
+
#include <optional>
|
|
5
|
+
|
|
6
|
+
// Compiles the pattern twice: `rx` is the pattern as-is, and
// `rx_reversed_partial` is the derived regex (see regex_to_reversed_partial_regex)
// that is matched against the reversed input to detect partial matches at the
// end of a string. Throws std::regex_error if the pattern is invalid.
common_regex::common_regex(const std::string & pattern) :
    pattern(pattern),
    rx(pattern),
    rx_reversed_partial(regex_to_reversed_partial_regex(pattern)) {}
|
|
10
|
+
|
|
11
|
+
// Searches `input` starting at byte offset `pos`.
// - If the pattern fully matches (regex_search, or regex_match when `as_match`
//   is true), returns a COMMON_REGEX_MATCH_TYPE_FULL result with one range per
//   capture group, expressed as absolute offsets into `input`.
// - Otherwise, tries the reversed-partial regex against the reversed input to
//   detect a match that is still in progress at the end of the string, and
//   returns a COMMON_REGEX_MATCH_TYPE_PARTIAL result covering [start, input.size()).
// - Returns a default (COMMON_REGEX_MATCH_TYPE_NONE) result when nothing matches.
// Throws std::runtime_error if `pos` is past the end of `input`.
common_regex_match common_regex::search(const std::string & input, size_t pos, bool as_match) const {
    std::smatch match;
    if (pos > input.size()) {
        throw std::runtime_error("Position out of bounds");
    }
    auto start = input.begin() + pos;
    auto found = as_match
        ? std::regex_match(start, input.end(), match, rx)
        : std::regex_search(start, input.end(), match, rx);
    if (found) {
        common_regex_match res;
        res.type = COMMON_REGEX_MATCH_TYPE_FULL;
        for (size_t i = 0; i < match.size(); ++i) {
            // match positions are relative to `start`; convert to absolute offsets.
            auto begin = pos + match.position(i);
            res.groups.emplace_back(begin, begin + match.length(i));
        }
        return res;
    }
    // Full match failed: look for a partial match by running the reversed-partial
    // regex over the input in reverse (only the [pos, size) suffix is considered).
    std::match_results<std::string::const_reverse_iterator> srmatch;
    if (std::regex_match(input.rbegin(), input.rend() - pos, srmatch, rx_reversed_partial)) {
        auto group = srmatch[1].str();
        if (group.length() != 0) {
            // base() converts the reverse iterator at the group's end back into a
            // forward iterator: the start of the partial match in the original input.
            auto it = srmatch[1].second.base();
            // auto position = static_cast<size_t>(std::distance(input.begin(), it));
            // In `as_match` mode the partial match must start at the beginning.
            if ((!as_match) || it == input.begin()) {
                common_regex_match res;
                res.type = COMMON_REGEX_MATCH_TYPE_PARTIAL;
                const size_t begin = std::distance(input.begin(), it);
                const size_t end = input.size();
                if (begin == std::string::npos || end == std::string::npos || begin > end) {
                    throw std::runtime_error("Invalid range");
                }
                res.groups.push_back({begin, end});
                return res;
            }
        }
    }
    return {};
}
|
|
50
|
+
|
|
51
|
+
/*
|
|
52
|
+
Transforms a regex pattern to a partial match pattern that operates on a reversed input string to find partial final matches of the original pattern.
|
|
53
|
+
|
|
54
|
+
Ideally we'd like to use boost::match_partial (https://beta.boost.org/doc/libs/1_59_0/libs/regex/doc/html/boost_regex/partial_matches.html)
|
|
55
|
+
to see if a string ends with a partial regex match, but but it's not in std::regex yet.
|
|
56
|
+
Instead, we'll the regex into a partial match regex operating as a full match on the reverse iterators of the input.
|
|
57
|
+
|
|
58
|
+
- /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:(?:d)?c)?b)?a).*
|
|
59
|
+
- /a|b/ -> (a|b).*
|
|
60
|
+
- /a*?/ -> error, could match ""
|
|
61
|
+
- /a*b/ -> ((?:b)?a*+).* (final repetitions become eager)
|
|
62
|
+
- /.*?ab/ -> ((?:b)?a).* (merge .*)
|
|
63
|
+
- /a.*?b/ -> ((?:b)?.*?a).* (keep reluctant matches)
|
|
64
|
+
- /a(bc)d/ -> ((?:(?:d)?(?:(?:c)?b))?a).*
|
|
65
|
+
- /a(bc|de)/ -> ((?:(?:(?:e)?d)?|(?:(?:c)?b)?)?a).*
|
|
66
|
+
- /ab{2,4}c/ -> abbb?b?c -> ((?:(?:(?:(?:(?:c)?b)?b)?b?)?b?)?a).*
|
|
67
|
+
|
|
68
|
+
The regex will match a reversed string fully, and the end of the first (And only) capturing group will indicate the reversed start of the original partial pattern
|
|
69
|
+
(i.e. just where the final .* starts in the inverted pattern; all other groups are turned into non-capturing groups, and reluctant quantifiers are ignored)
|
|
70
|
+
*/
|
|
71
|
+
std::string regex_to_reversed_partial_regex(const std::string & pattern) {
|
|
72
|
+
auto it = pattern.begin();
|
|
73
|
+
const auto end = pattern.end();
|
|
74
|
+
|
|
75
|
+
std::function<std::string()> process = [&]() {
|
|
76
|
+
std::vector<std::vector<std::string>> alternatives(1);
|
|
77
|
+
std::vector<std::string> * sequence = &alternatives.back();
|
|
78
|
+
|
|
79
|
+
while (it != end) {
|
|
80
|
+
if (*it == '[') {
|
|
81
|
+
auto start = it;
|
|
82
|
+
++it;
|
|
83
|
+
while (it != end) {
|
|
84
|
+
if ((*it == '\\') && (++it != end)) {
|
|
85
|
+
++it;
|
|
86
|
+
} else if ((it != end) && (*it == ']')) {
|
|
87
|
+
break;
|
|
88
|
+
} else {
|
|
89
|
+
++it;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
if (it == end) {
|
|
93
|
+
throw std::runtime_error("Unmatched '[' in pattern");
|
|
94
|
+
}
|
|
95
|
+
++it;
|
|
96
|
+
sequence->push_back(std::string(start, it));
|
|
97
|
+
} else if (*it == '*' || *it == '?' || *it == '+') {
|
|
98
|
+
if (sequence->empty()) {
|
|
99
|
+
throw std::runtime_error("Quantifier without preceding element");
|
|
100
|
+
}
|
|
101
|
+
sequence->back() += *it;
|
|
102
|
+
auto is_star = *it == '*';
|
|
103
|
+
++it;
|
|
104
|
+
if (is_star) {
|
|
105
|
+
if (*it == '?') {
|
|
106
|
+
++it;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
} else if (*it == '{') {
|
|
110
|
+
if (sequence->empty()) {
|
|
111
|
+
throw std::runtime_error("Repetition without preceding element");
|
|
112
|
+
}
|
|
113
|
+
++it;
|
|
114
|
+
auto start = it;
|
|
115
|
+
while (it != end && *it != '}') {
|
|
116
|
+
++it;
|
|
117
|
+
}
|
|
118
|
+
if (it == end) {
|
|
119
|
+
throw std::runtime_error("Unmatched '{' in pattern");
|
|
120
|
+
}
|
|
121
|
+
auto parts = string_split(std::string(start, it), ",");
|
|
122
|
+
++it;
|
|
123
|
+
if (parts.size() > 2) {
|
|
124
|
+
throw std::runtime_error("Invalid repetition range in pattern");
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
auto parseOptInt = [&](const std::string & s, const std::optional<int> & def = std::nullopt) -> std::optional<int> {
|
|
128
|
+
if (s.empty()) {
|
|
129
|
+
return def;
|
|
130
|
+
}
|
|
131
|
+
return std::stoi(s);
|
|
132
|
+
};
|
|
133
|
+
auto min = parseOptInt(parts[0], 0);
|
|
134
|
+
auto max = parts.size() == 1 ? min : parseOptInt(parts[1]);
|
|
135
|
+
if (min && max && *max < *min) {
|
|
136
|
+
throw std::runtime_error("Invalid repetition range in pattern");
|
|
137
|
+
}
|
|
138
|
+
// Brutal but... let's repeat at least min times, then ? for the delta between min & max (or * for unbounded)
|
|
139
|
+
auto part = sequence->back();
|
|
140
|
+
sequence->pop_back();
|
|
141
|
+
for (int i = 0; i < *min; i++) {
|
|
142
|
+
sequence->push_back(part);
|
|
143
|
+
}
|
|
144
|
+
if (max) {
|
|
145
|
+
for (int i = *min; i < *max; i++) {
|
|
146
|
+
sequence->push_back(part + "?");
|
|
147
|
+
}
|
|
148
|
+
} else {
|
|
149
|
+
sequence->push_back(part + "*");
|
|
150
|
+
}
|
|
151
|
+
} else if (*it == '(') {
|
|
152
|
+
++it;
|
|
153
|
+
if (it != end && *it == '?' && (it + 1 != end) && *(it + 1) == ':') {
|
|
154
|
+
it += 2;
|
|
155
|
+
}
|
|
156
|
+
auto sub = process();
|
|
157
|
+
if (*it != ')') {
|
|
158
|
+
throw std::runtime_error("Unmatched '(' in pattern");
|
|
159
|
+
}
|
|
160
|
+
++it;
|
|
161
|
+
auto & part = sequence->emplace_back("(?:");
|
|
162
|
+
part += sub;
|
|
163
|
+
part += ")";
|
|
164
|
+
} else if (*it == ')') {
|
|
165
|
+
break;
|
|
166
|
+
} else if (*it == '|') {
|
|
167
|
+
++it;
|
|
168
|
+
alternatives.emplace_back();
|
|
169
|
+
sequence = &alternatives.back();
|
|
170
|
+
} else if (*it == '\\' && (++it != end)) {
|
|
171
|
+
auto str = std::string("\\") + *it;
|
|
172
|
+
sequence->push_back(str);
|
|
173
|
+
++it;
|
|
174
|
+
} else if (it != end) {
|
|
175
|
+
sequence->push_back(std::string(1, *it));
|
|
176
|
+
++it;
|
|
177
|
+
}
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
// /abcd/ -> (dcba|cba|ba|a).* -> ((?:(?:(?:d)?c)?b)?a).*
|
|
181
|
+
// if n(=4) parts, opening n-1(=3) non-capturing groups after the 1 capturing group
|
|
182
|
+
// We'll do the outermost capturing group and final .* in the enclosing function.
|
|
183
|
+
std::vector<std::string> res_alts;
|
|
184
|
+
for (const auto & parts : alternatives) {
|
|
185
|
+
auto & res = res_alts.emplace_back();
|
|
186
|
+
for (size_t i = 0; i < parts.size() - 1; i++) {
|
|
187
|
+
res += "(?:";
|
|
188
|
+
}
|
|
189
|
+
for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
|
|
190
|
+
res += *it;
|
|
191
|
+
if (it != parts.rend() - 1) {
|
|
192
|
+
res += ")?";
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
return string_join(res_alts, "|");
|
|
197
|
+
};
|
|
198
|
+
auto res = process();
|
|
199
|
+
if (it != end) {
|
|
200
|
+
throw std::runtime_error("Unmatched '(' in pattern");
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
return "(" + res + ")[\\s\\S]*";
|
|
204
|
+
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#pragma once

#include <regex>
#include <stdexcept>   // std::runtime_error (was relied on transitively)
#include <string>
#include <vector>      // std::vector (was relied on transitively)

// Classification of a common_regex::search() result.
enum common_regex_match_type {
    COMMON_REGEX_MATCH_TYPE_NONE,
    COMMON_REGEX_MATCH_TYPE_PARTIAL,
    COMMON_REGEX_MATCH_TYPE_FULL,
};

// Half-open byte range [begin, end) into the searched input string.
struct common_string_range {
    size_t begin;
    size_t end;
    common_string_range(size_t begin, size_t end) : begin(begin), end(end) {
        if (begin > end) {
            throw std::runtime_error("Invalid range");
        }
    }
    // prevent default ctor
    common_string_range() = delete;
    bool empty() const {
        return begin == end;
    }
    bool operator==(const common_string_range & other) const {
        return begin == other.begin && end == other.end;
    }
};

// Result of a search: the match type plus one range per capture group
// (group 0 is the whole match for full matches; partial matches carry a
// single range covering the matched suffix).
struct common_regex_match {
    common_regex_match_type type = COMMON_REGEX_MATCH_TYPE_NONE;
    std::vector<common_string_range> groups;

    bool operator==(const common_regex_match & other) const {
        return type == other.type && groups == other.groups;
    }
    bool operator!=(const common_regex_match & other) const {
        return !(*this == other);
    }
};

// Wraps a std::regex together with a derived "reversed partial" regex that can
// detect matches still in progress at the end of an input (see the .cpp).
class common_regex {
    std::string pattern;
    std::regex rx;
    std::regex rx_reversed_partial;

  public:
    explicit common_regex(const std::string & pattern);

    common_regex_match search(const std::string & input, size_t pos, bool as_match = false) const;

    const std::string & str() const { return pattern; }
};

// For testing only (pretty print of failures).
std::string regex_to_reversed_partial_regex(const std::string & pattern);
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
#include "sampling.h"
|
|
2
2
|
|
|
3
3
|
#include "common.h"
|
|
4
|
+
#include "log.h"
|
|
4
5
|
|
|
5
6
|
#include <cmath>
|
|
6
7
|
#include <unordered_map>
|
|
@@ -208,6 +209,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|
|
208
209
|
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
|
209
210
|
trigger_tokens.data(), trigger_tokens.size())
|
|
210
211
|
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
|
212
|
+
if (!grmr) {
|
|
213
|
+
return nullptr;
|
|
214
|
+
}
|
|
211
215
|
}
|
|
212
216
|
|
|
213
217
|
auto * result = new common_sampler {
|
|
@@ -226,51 +230,48 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
|
|
226
230
|
params.logit_bias.data()));
|
|
227
231
|
|
|
228
232
|
if (params.mirostat == 0) {
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
{
|
|
238
|
-
std::vector<const char *> c_breakers;
|
|
239
|
-
c_breakers.reserve(params.dry_sequence_breakers.size());
|
|
240
|
-
for (const auto & str : params.dry_sequence_breakers) {
|
|
241
|
-
c_breakers.push_back(str.c_str());
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
|
233
|
+
for (const auto & cnstr : params.samplers) {
|
|
234
|
+
switch (cnstr) {
|
|
235
|
+
case COMMON_SAMPLER_TYPE_DRY:
|
|
236
|
+
{
|
|
237
|
+
std::vector<const char *> c_breakers;
|
|
238
|
+
c_breakers.reserve(params.dry_sequence_breakers.size());
|
|
239
|
+
for (const auto & str : params.dry_sequence_breakers) {
|
|
240
|
+
c_breakers.push_back(str.c_str());
|
|
245
241
|
}
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
242
|
+
|
|
243
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
|
|
244
|
+
}
|
|
245
|
+
break;
|
|
246
|
+
case COMMON_SAMPLER_TYPE_TOP_K:
|
|
247
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
|
248
|
+
break;
|
|
249
|
+
case COMMON_SAMPLER_TYPE_TOP_P:
|
|
250
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
|
251
|
+
break;
|
|
252
|
+
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA:
|
|
253
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_top_n_sigma (params.top_n_sigma));
|
|
254
|
+
break;
|
|
255
|
+
case COMMON_SAMPLER_TYPE_MIN_P:
|
|
256
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
|
257
|
+
break;
|
|
258
|
+
case COMMON_SAMPLER_TYPE_XTC:
|
|
259
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
|
|
260
|
+
break;
|
|
261
|
+
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
|
262
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
|
263
|
+
break;
|
|
264
|
+
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
|
265
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
|
266
|
+
break;
|
|
267
|
+
case COMMON_SAMPLER_TYPE_INFILL:
|
|
268
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
|
|
269
|
+
break;
|
|
270
|
+
case COMMON_SAMPLER_TYPE_PENALTIES:
|
|
271
|
+
llama_sampler_chain_add(result->chain, llama_sampler_init_penalties (params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
|
|
272
|
+
break;
|
|
273
|
+
default:
|
|
274
|
+
GGML_ASSERT(false && "unknown sampler type");
|
|
274
275
|
}
|
|
275
276
|
}
|
|
276
277
|
llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
|
|
@@ -472,6 +473,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
|
|
472
473
|
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
|
473
474
|
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
|
474
475
|
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
|
476
|
+
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return 's';
|
|
475
477
|
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
|
476
478
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
|
477
479
|
case COMMON_SAMPLER_TYPE_XTC: return 'x';
|
|
@@ -487,6 +489,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
|
|
487
489
|
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
|
488
490
|
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
|
489
491
|
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
|
492
|
+
case COMMON_SAMPLER_TYPE_TOP_N_SIGMA: return "top_n_sigma";
|
|
490
493
|
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
|
491
494
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
|
492
495
|
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
|
|
@@ -501,6 +504,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
|
|
501
504
|
{ "dry", COMMON_SAMPLER_TYPE_DRY },
|
|
502
505
|
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
|
503
506
|
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
|
507
|
+
{ "top_n_sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
|
504
508
|
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
505
509
|
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
|
506
510
|
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
|
@@ -514,6 +518,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
|
|
514
518
|
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
|
515
519
|
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
|
516
520
|
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
|
521
|
+
{ "top-n-sigma", COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
|
517
522
|
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
|
518
523
|
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
519
524
|
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
@@ -530,14 +535,16 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
|
|
|
530
535
|
auto sampler = sampler_canonical_name_map.find(name);
|
|
531
536
|
if (sampler != sampler_canonical_name_map.end()) {
|
|
532
537
|
samplers.push_back(sampler->second);
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
538
|
+
continue;
|
|
539
|
+
}
|
|
540
|
+
if (allow_alt_names) {
|
|
541
|
+
sampler = sampler_alt_name_map.find(name);
|
|
542
|
+
if (sampler != sampler_alt_name_map.end()) {
|
|
543
|
+
samplers.push_back(sampler->second);
|
|
544
|
+
continue;
|
|
539
545
|
}
|
|
540
546
|
}
|
|
547
|
+
LOG_WRN("%s: unable to match sampler by name '%s'\n", __func__, name.c_str());
|
|
541
548
|
}
|
|
542
549
|
|
|
543
550
|
return samplers;
|
|
@@ -549,6 +556,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
|
|
549
556
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
|
550
557
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
|
551
558
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
|
559
|
+
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_N_SIGMA), COMMON_SAMPLER_TYPE_TOP_N_SIGMA },
|
|
552
560
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
|
553
561
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
|
|
554
562
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
|
|
@@ -563,6 +571,8 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
|
|
|
563
571
|
const auto sampler = sampler_name_map.find(c);
|
|
564
572
|
if (sampler != sampler_name_map.end()) {
|
|
565
573
|
samplers.push_back(sampler->second);
|
|
574
|
+
} else {
|
|
575
|
+
LOG_WRN("%s: unable to match sampler by char '%c'\n", __func__, c);
|
|
566
576
|
}
|
|
567
577
|
}
|
|
568
578
|
|
|
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo
|
|
|
132
132
|
|
|
133
133
|
|
|
134
134
|
#### Compile and run inside a Fedora Toolbox Container
|
|
135
|
-
We also have a [guide](./
|
|
135
|
+
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
|
|
136
136
|
|
|
137
137
|
**Recommended for:**
|
|
138
|
-
|
|
139
|
-
-
|
|
140
|
-
-
|
|
138
|
+
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
|
|
139
|
+
- (there are no supported CUDA packages for these systems)
|
|
140
|
+
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
|
|
141
|
+
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
|
|
142
|
+
- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
|
|
141
143
|
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
|
|
142
144
|
|
|
143
145
|
|
|
@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:
|
|
|
189
191
|
|
|
190
192
|
| Option | Legal values | Default | Description |
|
|
191
193
|
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|
192
|
-
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
|
194
|
+
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
|
193
195
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
|
194
196
|
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
|
195
197
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
|
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be
|
|
|
216
218
|
|
|
217
219
|
```bash
|
|
218
220
|
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
|
|
221
|
+
cmake --build build --config Release
|
|
219
222
|
```
|
|
220
223
|
|
|
221
224
|
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
|
|
@@ -256,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|
|
256
259
|
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
|
257
260
|
&& cmake --build build --config Release -- -j 16
|
|
258
261
|
```
|
|
259
|
-
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
|
260
|
-
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
|
261
262
|
|
|
262
263
|
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
|
|
263
264
|
|
|
@@ -293,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
|
|
|
293
294
|
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
|
294
295
|
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
|
295
296
|
|
|
297
|
+
### Unified Memory
|
|
298
|
+
|
|
299
|
+
On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
|
300
|
+
|
|
296
301
|
## Vulkan
|
|
297
302
|
|
|
298
303
|
**Windows**
|
|
@@ -433,6 +438,116 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
|
|
433
438
|
|
|
434
439
|
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
|
|
435
440
|
|
|
441
|
+
## Arm® KleidiAI™
|
|
442
|
+
KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
|
|
443
|
+
|
|
444
|
+
To enable KleidiAI, go to the llama.cpp directory and build using CMake
|
|
445
|
+
```bash
|
|
446
|
+
cmake -B build -DGGML_CPU_KLEIDIAI=ON
|
|
447
|
+
cmake --build build --config Release
|
|
448
|
+
```
|
|
449
|
+
You can verify that KleidiAI is being used by running
|
|
450
|
+
```bash
|
|
451
|
+
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
|
|
452
|
+
```
|
|
453
|
+
If KleidiAI is enabled, the output will contain a line similar to:
|
|
454
|
+
```
|
|
455
|
+
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
|
|
456
|
+
```
|
|
457
|
+
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
|
|
458
|
+
|
|
459
|
+
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
|
|
460
|
+
|
|
461
|
+
## OpenCL
|
|
462
|
+
|
|
463
|
+
This provides GPU acceleration through OpenCL on recent Adreno GPU.
|
|
464
|
+
More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
|
|
465
|
+
|
|
466
|
+
### Android
|
|
467
|
+
|
|
468
|
+
Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
|
|
469
|
+
|
|
470
|
+
```sh
|
|
471
|
+
mkdir -p ~/dev/llm
|
|
472
|
+
cd ~/dev/llm
|
|
473
|
+
|
|
474
|
+
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
|
|
475
|
+
cd OpenCL-Headers && \
|
|
476
|
+
cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
|
477
|
+
|
|
478
|
+
cd ~/dev/llm
|
|
479
|
+
|
|
480
|
+
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
|
|
481
|
+
cd OpenCL-ICD-Loader && \
|
|
482
|
+
mkdir build_ndk && cd build_ndk && \
|
|
483
|
+
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
|
|
484
|
+
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
|
485
|
+
-DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
|
|
486
|
+
-DANDROID_ABI=arm64-v8a \
|
|
487
|
+
-DANDROID_PLATFORM=24 \
|
|
488
|
+
-DANDROID_STL=c++_shared && \
|
|
489
|
+
ninja && \
|
|
490
|
+
cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
Then build llama.cpp with OpenCL enabled,
|
|
494
|
+
|
|
495
|
+
```sh
|
|
496
|
+
cd ~/dev/llm
|
|
497
|
+
|
|
498
|
+
git clone https://github.com/ggml-org/llama.cpp && \
|
|
499
|
+
cd llama.cpp && \
|
|
500
|
+
mkdir build-android && cd build-android
|
|
501
|
+
|
|
502
|
+
cmake .. -G Ninja \
|
|
503
|
+
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
|
504
|
+
-DANDROID_ABI=arm64-v8a \
|
|
505
|
+
-DANDROID_PLATFORM=android-28 \
|
|
506
|
+
-DBUILD_SHARED_LIBS=OFF \
|
|
507
|
+
-DGGML_OPENCL=ON
|
|
508
|
+
|
|
509
|
+
ninja
|
|
510
|
+
```
|
|
511
|
+
|
|
512
|
+
### Windows Arm64
|
|
513
|
+
|
|
514
|
+
First, install OpenCL headers and ICD loader library if not available,
|
|
515
|
+
|
|
516
|
+
```powershell
|
|
517
|
+
mkdir -p ~/dev/llm
|
|
518
|
+
|
|
519
|
+
cd ~/dev/llm
|
|
520
|
+
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
|
|
521
|
+
mkdir build && cd build
|
|
522
|
+
cmake .. -G Ninja `
|
|
523
|
+
-DBUILD_TESTING=OFF `
|
|
524
|
+
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
|
525
|
+
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
|
526
|
+
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
|
527
|
+
cmake --build . --target install
|
|
528
|
+
|
|
529
|
+
cd ~/dev/llm
|
|
530
|
+
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
|
|
531
|
+
mkdir build && cd build
|
|
532
|
+
cmake .. -G Ninja `
|
|
533
|
+
-DCMAKE_BUILD_TYPE=Release `
|
|
534
|
+
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
|
|
535
|
+
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
|
536
|
+
cmake --build . --target install
|
|
537
|
+
```
|
|
538
|
+
|
|
539
|
+
Then build llama.cpp with OpenCL enabled,
|
|
540
|
+
|
|
541
|
+
```powershell
|
|
542
|
+
cmake .. -G Ninja `
|
|
543
|
+
-DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
|
|
544
|
+
-DCMAKE_BUILD_TYPE=Release `
|
|
545
|
+
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
|
|
546
|
+
-DBUILD_SHARED_LIBS=OFF `
|
|
547
|
+
-DGGML_OPENCL=ON
|
|
548
|
+
ninja
|
|
549
|
+
```
|
|
550
|
+
|
|
436
551
|
## Android
|
|
437
552
|
|
|
438
553
|
To read documentation for how to build on Android, [click here](./android.md)
|