@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -36,6 +36,46 @@ static uint64_t get_time_ns() {
|
|
|
36
36
|
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
|
37
37
|
}
|
|
38
38
|
|
|
39
|
+
static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
|
|
40
|
+
if (a.pattern != b.pattern) {
|
|
41
|
+
// cString comparison that may be null
|
|
42
|
+
if (a.pattern == nullptr || b.pattern == nullptr) {
|
|
43
|
+
return false;
|
|
44
|
+
}
|
|
45
|
+
if (strcmp(a.pattern, b.pattern) != 0) {
|
|
46
|
+
return false;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
if (a.buft != b.buft) {
|
|
50
|
+
return false;
|
|
51
|
+
}
|
|
52
|
+
return true;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
|
|
56
|
+
if (a.size() != b.size()) {
|
|
57
|
+
return false;
|
|
58
|
+
}
|
|
59
|
+
for (size_t i = 0; i < a.size(); i++) {
|
|
60
|
+
if (!tensor_buft_override_equal(a[i], b[i])) {
|
|
61
|
+
return false;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return true;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
|
|
68
|
+
if (a.size() != b.size()) {
|
|
69
|
+
return false;
|
|
70
|
+
}
|
|
71
|
+
for (size_t i = 0; i < a.size(); i++) {
|
|
72
|
+
if (!vec_tensor_buft_override_equal(a[i], b[i])) {
|
|
73
|
+
return false;
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
return true;
|
|
77
|
+
}
|
|
78
|
+
|
|
39
79
|
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
|
|
40
80
|
std::ostringstream str;
|
|
41
81
|
for (size_t i = 0; i < values.size(); i++) {
|
|
@@ -155,15 +195,57 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
|
|
155
195
|
return buf;
|
|
156
196
|
}
|
|
157
197
|
|
|
198
|
+
// Expands a comma-separated list of integer items into the values they denote.
//
// Each item has the form  first[-last[(+|*)step]] :
//   "8"        -> 8
//   "1-4"      -> 1, 2, 3, 4            (default: +1)
//   "0-10+5"   -> 0, 5, 10
//   "1-16*2"   -> 1, 2, 4, 8, 16
// Values are appended in the order the items appear. An item whose `first`
// is greater than `last` contributes no values.
//
// Throws std::invalid_argument("invalid range format") when some part of `s`
// does not match the item syntax, and std::invalid_argument("invalid range")
// when a step would not advance the value (e.g. "+0", "*1", or "*n" starting
// from 0). std::stoi may additionally throw for numbers that do not fit in int.
static std::vector<int> parse_int_range(const std::string & s) {
    // first[-last[(+|*)step]], terminated by ',' or end of input.
    // Built once: the pattern is constant, so there is no need to recompile it per call.
    static const std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([+*])(\d+))?)?(?:,|$))");

    std::smatch                 match;
    std::string::const_iterator search_start(s.cbegin());
    std::vector<int>            result;
    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
        const int  first = std::stoi(match[1]);
        const int  last  = match[2].matched ? std::stoi(match[2]) : first;
        const char op    = match[3].matched ? match[3].str()[0] : '+';
        const int  step  = match[4].matched ? std::stoi(match[4]) : 1;

        for (int i = first; i <= last;) {
            result.push_back(i);

            // Compute the successor in a wider type so that values close to
            // INT_MAX cannot trigger signed overflow before the bound check.
            long long next = 0;
            if (op == '+') {
                next = static_cast<long long>(i) + step;
            } else if (op == '*') {
                next = static_cast<long long>(i) * step;
            } else {
                throw std::invalid_argument("invalid range format");
            }

            // A step that does not move forward would loop forever.
            if (next <= i) {
                throw std::invalid_argument("invalid range");
            }
            // Past the upper bound: this item is complete.
            if (next > last) {
                break;
            }
            i = static_cast<int>(next);  // safe: next <= last <= INT_MAX
        }
        // Continue right after the consumed item (and its trailing ',' if any).
        search_start = match.suffix().first;
    }

    // Anything left over is text the item pattern could not consume.
    if (search_start != s.cend()) {
        throw std::invalid_argument("invalid range format");
    }

    return result;
}
|
|
237
|
+
|
|
158
238
|
struct cmd_params {
|
|
159
239
|
std::vector<std::string> model;
|
|
160
240
|
std::vector<int> n_prompt;
|
|
161
241
|
std::vector<int> n_gen;
|
|
162
242
|
std::vector<std::pair<int, int>> n_pg;
|
|
243
|
+
std::vector<int> n_depth;
|
|
163
244
|
std::vector<int> n_batch;
|
|
164
245
|
std::vector<int> n_ubatch;
|
|
165
246
|
std::vector<ggml_type> type_k;
|
|
166
247
|
std::vector<ggml_type> type_v;
|
|
248
|
+
std::vector<float> defrag_thold;
|
|
167
249
|
std::vector<int> n_threads;
|
|
168
250
|
std::vector<std::string> cpu_mask;
|
|
169
251
|
std::vector<bool> cpu_strict;
|
|
@@ -175,8 +257,10 @@ struct cmd_params {
|
|
|
175
257
|
std::vector<bool> no_kv_offload;
|
|
176
258
|
std::vector<bool> flash_attn;
|
|
177
259
|
std::vector<std::vector<float>> tensor_split;
|
|
260
|
+
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
|
|
178
261
|
std::vector<bool> use_mmap;
|
|
179
262
|
std::vector<bool> embeddings;
|
|
263
|
+
std::vector<bool> no_op_offload;
|
|
180
264
|
ggml_numa_strategy numa;
|
|
181
265
|
int reps;
|
|
182
266
|
ggml_sched_priority prio;
|
|
@@ -192,10 +276,12 @@ static const cmd_params cmd_params_defaults = {
|
|
|
192
276
|
/* n_prompt */ { 512 },
|
|
193
277
|
/* n_gen */ { 128 },
|
|
194
278
|
/* n_pg */ {},
|
|
279
|
+
/* n_depth */ { 0 },
|
|
195
280
|
/* n_batch */ { 2048 },
|
|
196
281
|
/* n_ubatch */ { 512 },
|
|
197
282
|
/* type_k */ { GGML_TYPE_F16 },
|
|
198
283
|
/* type_v */ { GGML_TYPE_F16 },
|
|
284
|
+
/* defrag_thold */ { -1.0f },
|
|
199
285
|
/* n_threads */ { cpu_get_num_math() },
|
|
200
286
|
/* cpu_mask */ { "0x0" },
|
|
201
287
|
/* cpu_strict */ { false },
|
|
@@ -207,8 +293,10 @@ static const cmd_params cmd_params_defaults = {
|
|
|
207
293
|
/* no_kv_offload */ { false },
|
|
208
294
|
/* flash_attn */ { false },
|
|
209
295
|
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
|
296
|
+
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
|
210
297
|
/* use_mmap */ { true },
|
|
211
298
|
/* embeddings */ { false },
|
|
299
|
+
/* no_op_offload */ { false },
|
|
212
300
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
|
213
301
|
/* reps */ 5,
|
|
214
302
|
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
|
@@ -224,12 +312,29 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
224
312
|
printf("\n");
|
|
225
313
|
printf("options:\n");
|
|
226
314
|
printf(" -h, --help\n");
|
|
315
|
+
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
|
|
316
|
+
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
|
|
317
|
+
cmd_params_defaults.reps);
|
|
318
|
+
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
|
|
319
|
+
cmd_params_defaults.prio);
|
|
320
|
+
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
|
|
321
|
+
cmd_params_defaults.delay);
|
|
322
|
+
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
|
|
323
|
+
output_format_str(cmd_params_defaults.output_format));
|
|
324
|
+
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
|
|
325
|
+
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
326
|
+
printf(" -v, --verbose verbose output\n");
|
|
327
|
+
printf(" --progress print test progress indicators\n");
|
|
328
|
+
printf("\n");
|
|
329
|
+
printf("test parameters:\n");
|
|
227
330
|
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
228
331
|
printf(" -p, --n-prompt <n> (default: %s)\n",
|
|
229
332
|
join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
230
333
|
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
231
334
|
printf(" -pg <pp,tg> (default: %s)\n",
|
|
232
335
|
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
336
|
+
printf(" -d, --n-depth <n> (default: %s)\n",
|
|
337
|
+
join(cmd_params_defaults.n_depth, ",").c_str());
|
|
233
338
|
printf(" -b, --batch-size <n> (default: %s)\n",
|
|
234
339
|
join(cmd_params_defaults.n_batch, ",").c_str());
|
|
235
340
|
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
|
@@ -238,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
238
343
|
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
239
344
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
|
240
345
|
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
346
|
+
printf(" -dt, --defrag-thold <f> (default: %s)\n",
|
|
347
|
+
join(cmd_params_defaults.defrag_thold, ",").c_str());
|
|
241
348
|
printf(" -t, --threads <n> (default: %s)\n",
|
|
242
349
|
join(cmd_params_defaults.n_threads, ",").c_str());
|
|
243
350
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
|
@@ -261,23 +368,17 @@ static void print_usage(int /* argc */, char ** argv) {
|
|
|
261
368
|
join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
262
369
|
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
|
263
370
|
join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
264
|
-
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
|
265
371
|
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
|
266
372
|
join(cmd_params_defaults.embeddings, ",").c_str());
|
|
267
373
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
268
|
-
printf(" -
|
|
269
|
-
printf("
|
|
270
|
-
printf(" --
|
|
271
|
-
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
272
|
-
output_format_str(cmd_params_defaults.output_format));
|
|
273
|
-
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
|
|
274
|
-
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
275
|
-
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
|
276
|
-
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
|
374
|
+
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
|
|
375
|
+
printf(" (default: disabled)\n");
|
|
376
|
+
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
|
|
277
377
|
printf("\n");
|
|
278
378
|
printf(
|
|
279
|
-
"Multiple values can be given for each parameter by separating them with ','
|
|
280
|
-
"multiple times
|
|
379
|
+
"Multiple values can be given for each parameter by separating them with ','\n"
|
|
380
|
+
"or by specifying the parameter multiple times. Ranges can be given as\n"
|
|
381
|
+
"'first-last' or 'first-last+step' or 'first-last*mult'.\n");
|
|
281
382
|
}
|
|
282
383
|
|
|
283
384
|
static ggml_type ggml_type_from_name(const std::string & s) {
|
|
@@ -331,179 +432,197 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
331
432
|
std::replace(arg.begin(), arg.end(), '_', '-');
|
|
332
433
|
}
|
|
333
434
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
if (
|
|
339
|
-
|
|
340
|
-
break;
|
|
341
|
-
}
|
|
342
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
343
|
-
params.model.insert(params.model.end(), p.begin(), p.end());
|
|
344
|
-
} else if (arg == "-p" || arg == "--n-prompt") {
|
|
345
|
-
if (++i >= argc) {
|
|
346
|
-
invalid_param = true;
|
|
347
|
-
break;
|
|
348
|
-
}
|
|
349
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
350
|
-
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
|
351
|
-
} else if (arg == "-n" || arg == "--n-gen") {
|
|
352
|
-
if (++i >= argc) {
|
|
353
|
-
invalid_param = true;
|
|
354
|
-
break;
|
|
355
|
-
}
|
|
356
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
357
|
-
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
|
358
|
-
} else if (arg == "-pg") {
|
|
359
|
-
if (++i >= argc) {
|
|
360
|
-
invalid_param = true;
|
|
361
|
-
break;
|
|
362
|
-
}
|
|
363
|
-
auto p = string_split<std::string>(argv[i], ',');
|
|
364
|
-
if (p.size() != 2) {
|
|
365
|
-
invalid_param = true;
|
|
366
|
-
break;
|
|
367
|
-
}
|
|
368
|
-
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
|
369
|
-
} else if (arg == "-b" || arg == "--batch-size") {
|
|
370
|
-
if (++i >= argc) {
|
|
371
|
-
invalid_param = true;
|
|
372
|
-
break;
|
|
373
|
-
}
|
|
374
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
375
|
-
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
|
376
|
-
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
|
377
|
-
if (++i >= argc) {
|
|
378
|
-
invalid_param = true;
|
|
379
|
-
break;
|
|
380
|
-
}
|
|
381
|
-
auto p = string_split<int>(argv[i], split_delim);
|
|
382
|
-
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
|
383
|
-
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
384
|
-
if (++i >= argc) {
|
|
385
|
-
invalid_param = true;
|
|
386
|
-
break;
|
|
387
|
-
}
|
|
388
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
389
|
-
std::vector<ggml_type> types;
|
|
390
|
-
for (const auto & t : p) {
|
|
391
|
-
ggml_type gt = ggml_type_from_name(t);
|
|
392
|
-
if (gt == GGML_TYPE_COUNT) {
|
|
435
|
+
try {
|
|
436
|
+
if (arg == "-h" || arg == "--help") {
|
|
437
|
+
print_usage(argc, argv);
|
|
438
|
+
exit(0);
|
|
439
|
+
} else if (arg == "-m" || arg == "--model") {
|
|
440
|
+
if (++i >= argc) {
|
|
393
441
|
invalid_param = true;
|
|
394
442
|
break;
|
|
395
443
|
}
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
if (
|
|
399
|
-
|
|
400
|
-
}
|
|
401
|
-
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
402
|
-
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
403
|
-
if (++i >= argc) {
|
|
404
|
-
invalid_param = true;
|
|
405
|
-
break;
|
|
406
|
-
}
|
|
407
|
-
auto p = string_split<std::string>(argv[i], split_delim);
|
|
408
|
-
std::vector<ggml_type> types;
|
|
409
|
-
for (const auto & t : p) {
|
|
410
|
-
ggml_type gt = ggml_type_from_name(t);
|
|
411
|
-
if (gt == GGML_TYPE_COUNT) {
|
|
444
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
445
|
+
params.model.insert(params.model.end(), p.begin(), p.end());
|
|
446
|
+
} else if (arg == "-p" || arg == "--n-prompt") {
|
|
447
|
+
if (++i >= argc) {
|
|
412
448
|
invalid_param = true;
|
|
413
449
|
break;
|
|
414
450
|
}
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
if (
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
451
|
+
auto p = parse_int_range(argv[i]);
|
|
452
|
+
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
|
453
|
+
} else if (arg == "-n" || arg == "--n-gen") {
|
|
454
|
+
if (++i >= argc) {
|
|
455
|
+
invalid_param = true;
|
|
456
|
+
break;
|
|
457
|
+
}
|
|
458
|
+
auto p = parse_int_range(argv[i]);
|
|
459
|
+
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
|
460
|
+
} else if (arg == "-pg") {
|
|
461
|
+
if (++i >= argc) {
|
|
462
|
+
invalid_param = true;
|
|
463
|
+
break;
|
|
464
|
+
}
|
|
465
|
+
auto p = string_split<std::string>(argv[i], ',');
|
|
466
|
+
if (p.size() != 2) {
|
|
467
|
+
invalid_param = true;
|
|
468
|
+
break;
|
|
469
|
+
}
|
|
470
|
+
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
|
471
|
+
} else if (arg == "-d" || arg == "--n-depth") {
|
|
472
|
+
if (++i >= argc) {
|
|
473
|
+
invalid_param = true;
|
|
474
|
+
break;
|
|
475
|
+
}
|
|
476
|
+
auto p = parse_int_range(argv[i]);
|
|
477
|
+
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
|
478
|
+
} else if (arg == "-b" || arg == "--batch-size") {
|
|
479
|
+
if (++i >= argc) {
|
|
480
|
+
invalid_param = true;
|
|
481
|
+
break;
|
|
482
|
+
}
|
|
483
|
+
auto p = parse_int_range(argv[i]);
|
|
484
|
+
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
|
485
|
+
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
|
486
|
+
if (++i >= argc) {
|
|
487
|
+
invalid_param = true;
|
|
488
|
+
break;
|
|
489
|
+
}
|
|
490
|
+
auto p = parse_int_range(argv[i]);
|
|
491
|
+
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
|
492
|
+
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
|
493
|
+
if (++i >= argc) {
|
|
494
|
+
invalid_param = true;
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
498
|
+
|
|
499
|
+
std::vector<ggml_type> types;
|
|
500
|
+
for (const auto & t : p) {
|
|
501
|
+
ggml_type gt = ggml_type_from_name(t);
|
|
502
|
+
if (gt == GGML_TYPE_COUNT) {
|
|
503
|
+
invalid_param = true;
|
|
504
|
+
break;
|
|
505
|
+
}
|
|
506
|
+
types.push_back(gt);
|
|
507
|
+
}
|
|
508
|
+
if (invalid_param) {
|
|
509
|
+
break;
|
|
510
|
+
}
|
|
511
|
+
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
|
|
512
|
+
} else if (arg == "-ctv" || arg == "--cache-type-v") {
|
|
513
|
+
if (++i >= argc) {
|
|
514
|
+
invalid_param = true;
|
|
515
|
+
break;
|
|
516
|
+
}
|
|
517
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
518
|
+
|
|
519
|
+
std::vector<ggml_type> types;
|
|
520
|
+
for (const auto & t : p) {
|
|
521
|
+
ggml_type gt = ggml_type_from_name(t);
|
|
522
|
+
if (gt == GGML_TYPE_COUNT) {
|
|
523
|
+
invalid_param = true;
|
|
524
|
+
break;
|
|
525
|
+
}
|
|
526
|
+
types.push_back(gt);
|
|
527
|
+
}
|
|
528
|
+
if (invalid_param) {
|
|
529
|
+
break;
|
|
530
|
+
}
|
|
531
|
+
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
|
|
532
|
+
} else if (arg == "-dt" || arg == "--defrag-thold") {
|
|
533
|
+
if (++i >= argc) {
|
|
534
|
+
invalid_param = true;
|
|
535
|
+
break;
|
|
536
|
+
}
|
|
537
|
+
auto p = string_split<float>(argv[i], split_delim);
|
|
538
|
+
params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
|
|
539
|
+
} else if (arg == "-t" || arg == "--threads") {
|
|
540
|
+
if (++i >= argc) {
|
|
541
|
+
invalid_param = true;
|
|
542
|
+
break;
|
|
543
|
+
}
|
|
544
|
+
auto p = parse_int_range(argv[i]);
|
|
545
|
+
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
|
546
|
+
} else if (arg == "-C" || arg == "--cpu-mask") {
|
|
547
|
+
if (++i >= argc) {
|
|
548
|
+
invalid_param = true;
|
|
549
|
+
break;
|
|
550
|
+
}
|
|
551
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
552
|
+
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
|
553
|
+
} else if (arg == "--cpu-strict") {
|
|
554
|
+
if (++i >= argc) {
|
|
555
|
+
invalid_param = true;
|
|
556
|
+
break;
|
|
557
|
+
}
|
|
558
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
559
|
+
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
|
560
|
+
} else if (arg == "--poll") {
|
|
561
|
+
if (++i >= argc) {
|
|
562
|
+
invalid_param = true;
|
|
563
|
+
break;
|
|
564
|
+
}
|
|
565
|
+
auto p = parse_int_range(argv[i]);
|
|
566
|
+
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
|
567
|
+
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
|
568
|
+
if (++i >= argc) {
|
|
569
|
+
invalid_param = true;
|
|
570
|
+
break;
|
|
571
|
+
}
|
|
572
|
+
auto p = parse_int_range(argv[i]);
|
|
573
|
+
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
|
574
|
+
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
|
|
575
|
+
if (++i >= argc) {
|
|
576
|
+
invalid_param = true;
|
|
577
|
+
break;
|
|
578
|
+
}
|
|
579
|
+
params.rpc_servers.push_back(argv[i]);
|
|
580
|
+
} else if (arg == "-sm" || arg == "--split-mode") {
|
|
581
|
+
if (++i >= argc) {
|
|
582
|
+
invalid_param = true;
|
|
583
|
+
break;
|
|
584
|
+
}
|
|
585
|
+
auto p = string_split<std::string>(argv[i], split_delim);
|
|
586
|
+
|
|
587
|
+
std::vector<llama_split_mode> modes;
|
|
588
|
+
for (const auto & m : p) {
|
|
589
|
+
llama_split_mode mode;
|
|
590
|
+
if (m == "none") {
|
|
591
|
+
mode = LLAMA_SPLIT_MODE_NONE;
|
|
592
|
+
} else if (m == "layer") {
|
|
593
|
+
mode = LLAMA_SPLIT_MODE_LAYER;
|
|
594
|
+
} else if (m == "row") {
|
|
595
|
+
mode = LLAMA_SPLIT_MODE_ROW;
|
|
596
|
+
} else {
|
|
597
|
+
invalid_param = true;
|
|
598
|
+
break;
|
|
599
|
+
}
|
|
600
|
+
modes.push_back(mode);
|
|
601
|
+
}
|
|
602
|
+
if (invalid_param) {
|
|
603
|
+
break;
|
|
604
|
+
}
|
|
605
|
+
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
606
|
+
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
607
|
+
if (++i >= argc) {
|
|
608
|
+
invalid_param = true;
|
|
609
|
+
break;
|
|
610
|
+
}
|
|
611
|
+
params.main_gpu = parse_int_range(argv[i]);
|
|
612
|
+
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
613
|
+
if (++i >= argc) {
|
|
614
|
+
invalid_param = true;
|
|
615
|
+
break;
|
|
616
|
+
}
|
|
617
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
618
|
+
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
|
619
|
+
} else if (arg == "--numa") {
|
|
620
|
+
if (++i >= argc) {
|
|
478
621
|
invalid_param = true;
|
|
479
622
|
break;
|
|
480
623
|
}
|
|
481
|
-
modes.push_back(mode);
|
|
482
|
-
}
|
|
483
|
-
if (invalid_param) {
|
|
484
|
-
break;
|
|
485
|
-
}
|
|
486
|
-
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
|
|
487
|
-
} else if (arg == "-mg" || arg == "--main-gpu") {
|
|
488
|
-
if (++i >= argc) {
|
|
489
|
-
invalid_param = true;
|
|
490
|
-
break;
|
|
491
|
-
}
|
|
492
|
-
params.main_gpu = string_split<int>(argv[i], split_delim);
|
|
493
|
-
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
|
494
|
-
if (++i >= argc) {
|
|
495
|
-
invalid_param = true;
|
|
496
|
-
break;
|
|
497
|
-
}
|
|
498
|
-
auto p = string_split<bool>(argv[i], split_delim);
|
|
499
|
-
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
|
|
500
|
-
} else if (arg == "--numa") {
|
|
501
|
-
if (++i >= argc) {
|
|
502
|
-
invalid_param = true;
|
|
503
|
-
break;
|
|
504
|
-
} else {
|
|
505
624
|
std::string value(argv[i]);
|
|
506
|
-
|
|
625
|
+
if (value == "distribute" || value == "") {
|
|
507
626
|
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
|
|
508
627
|
} else if (value == "isolate") {
|
|
509
628
|
params.numa = GGML_NUMA_STRATEGY_ISOLATE;
|
|
@@ -513,89 +632,183 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
513
632
|
invalid_param = true;
|
|
514
633
|
break;
|
|
515
634
|
}
|
|
516
|
-
}
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
635
|
+
} else if (arg == "-fa" || arg == "--flash-attn") {
|
|
636
|
+
if (++i >= argc) {
|
|
637
|
+
invalid_param = true;
|
|
638
|
+
break;
|
|
639
|
+
}
|
|
640
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
641
|
+
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
|
|
642
|
+
} else if (arg == "-mmp" || arg == "--mmap") {
|
|
643
|
+
if (++i >= argc) {
|
|
644
|
+
invalid_param = true;
|
|
645
|
+
break;
|
|
646
|
+
}
|
|
647
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
648
|
+
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
|
|
649
|
+
} else if (arg == "-embd" || arg == "--embeddings") {
|
|
650
|
+
if (++i >= argc) {
|
|
651
|
+
invalid_param = true;
|
|
652
|
+
break;
|
|
653
|
+
}
|
|
654
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
655
|
+
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
|
|
656
|
+
} else if (arg == "-nopo" || arg == "--no-op-offload") {
|
|
657
|
+
if (++i >= argc) {
|
|
658
|
+
invalid_param = true;
|
|
659
|
+
break;
|
|
660
|
+
}
|
|
661
|
+
auto p = string_split<bool>(argv[i], split_delim);
|
|
662
|
+
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
|
|
663
|
+
} else if (arg == "-ts" || arg == "--tensor-split") {
|
|
664
|
+
if (++i >= argc) {
|
|
665
|
+
invalid_param = true;
|
|
666
|
+
break;
|
|
667
|
+
}
|
|
668
|
+
for (auto ts : string_split<std::string>(argv[i], split_delim)) {
|
|
669
|
+
// split string by ; and /
|
|
670
|
+
const std::regex regex{ R"([;/]+)" };
|
|
671
|
+
std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
|
|
672
|
+
std::vector<std::string> split_arg{ it, {} };
|
|
673
|
+
GGML_ASSERT(split_arg.size() <= llama_max_devices());
|
|
674
|
+
|
|
675
|
+
std::vector<float> tensor_split(llama_max_devices());
|
|
676
|
+
for (size_t i = 0; i < llama_max_devices(); ++i) {
|
|
677
|
+
if (i < split_arg.size()) {
|
|
678
|
+
tensor_split[i] = std::stof(split_arg[i]);
|
|
679
|
+
} else {
|
|
680
|
+
tensor_split[i] = 0.0f;
|
|
681
|
+
}
|
|
682
|
+
}
|
|
683
|
+
params.tensor_split.push_back(tensor_split);
|
|
684
|
+
}
|
|
685
|
+
} else if (arg == "-ot" || arg == "--override-tensor") {
|
|
686
|
+
if (++i >= argc) {
|
|
687
|
+
invalid_param = true;
|
|
688
|
+
break;
|
|
689
|
+
}
|
|
690
|
+
auto * value = argv[i];
|
|
691
|
+
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
|
692
|
+
if (buft_list.empty()) {
|
|
693
|
+
// enumerate all the devices and add their buffer types to the list
|
|
694
|
+
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
|
695
|
+
auto * dev = ggml_backend_dev_get(i);
|
|
696
|
+
auto * buft = ggml_backend_dev_buffer_type(dev);
|
|
697
|
+
if (buft) {
|
|
698
|
+
buft_list[ggml_backend_buft_name(buft)] = buft;
|
|
699
|
+
}
|
|
700
|
+
}
|
|
701
|
+
}
|
|
702
|
+
auto override_group_span_len = std::strcspn(value, ",");
|
|
703
|
+
bool last_group = false;
|
|
704
|
+
do {
|
|
705
|
+
if (override_group_span_len == 0) {
|
|
706
|
+
// Adds an empty override-tensors for an empty span
|
|
707
|
+
params.tensor_buft_overrides.push_back({{}});
|
|
708
|
+
if (value[override_group_span_len] == '\0') {
|
|
709
|
+
value = &value[override_group_span_len];
|
|
710
|
+
last_group = true;
|
|
711
|
+
} else {
|
|
712
|
+
value = &value[override_group_span_len + 1];
|
|
713
|
+
override_group_span_len = std::strcspn(value, ",");
|
|
714
|
+
}
|
|
715
|
+
continue;
|
|
716
|
+
}
|
|
717
|
+
// Stamps null terminators into the argv
|
|
718
|
+
// value for this option to avoid the
|
|
719
|
+
// memory leak present in the implementation
|
|
720
|
+
// over in arg.cpp. Acceptable because we
|
|
721
|
+
// only parse these args once in this program.
|
|
722
|
+
auto * override_group = value;
|
|
723
|
+
if (value[override_group_span_len] == '\0') {
|
|
724
|
+
value = &value[override_group_span_len];
|
|
725
|
+
last_group = true;
|
|
554
726
|
} else {
|
|
555
|
-
|
|
727
|
+
value[override_group_span_len] = '\0';
|
|
728
|
+
value = &value[override_group_span_len + 1];
|
|
729
|
+
}
|
|
730
|
+
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
|
|
731
|
+
auto override_span_len = std::strcspn(override_group, ";");
|
|
732
|
+
while (override_span_len > 0) {
|
|
733
|
+
auto * override = override_group;
|
|
734
|
+
if (override_group[override_span_len] != '\0') {
|
|
735
|
+
override_group[override_span_len] = '\0';
|
|
736
|
+
override_group = &override_group[override_span_len + 1];
|
|
737
|
+
} else {
|
|
738
|
+
override_group = &override_group[override_span_len];
|
|
739
|
+
}
|
|
740
|
+
auto tensor_name_span_len = std::strcspn(override, "=");
|
|
741
|
+
if (tensor_name_span_len >= override_span_len) {
|
|
742
|
+
invalid_param = true;
|
|
743
|
+
break;
|
|
744
|
+
}
|
|
745
|
+
override[tensor_name_span_len] = '\0';
|
|
746
|
+
auto * tensor_name = override;
|
|
747
|
+
auto * buffer_type = &override[tensor_name_span_len + 1];
|
|
748
|
+
if (buft_list.find(buffer_type) == buft_list.end()) {
|
|
749
|
+
printf("error: unrecognized buffer type '%s'\n", buffer_type);
|
|
750
|
+
printf("Available buffer types:\n");
|
|
751
|
+
for (const auto & it : buft_list) {
|
|
752
|
+
printf(" %s\n", ggml_backend_buft_name(it.second));
|
|
753
|
+
}
|
|
754
|
+
invalid_param = true;
|
|
755
|
+
break;
|
|
756
|
+
}
|
|
757
|
+
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
|
|
758
|
+
override_span_len = std::strcspn(override_group, ";");
|
|
556
759
|
}
|
|
760
|
+
if (invalid_param) {
|
|
761
|
+
break;
|
|
762
|
+
}
|
|
763
|
+
group_tensor_buft_overrides.push_back({nullptr,nullptr});
|
|
764
|
+
params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
|
|
765
|
+
override_group_span_len = std::strcspn(value, ",");
|
|
766
|
+
} while (!last_group);
|
|
767
|
+
} else if (arg == "-r" || arg == "--repetitions") {
|
|
768
|
+
if (++i >= argc) {
|
|
769
|
+
invalid_param = true;
|
|
770
|
+
break;
|
|
557
771
|
}
|
|
558
|
-
params.
|
|
559
|
-
}
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
if (
|
|
772
|
+
params.reps = std::stoi(argv[i]);
|
|
773
|
+
} else if (arg == "--prio") {
|
|
774
|
+
if (++i >= argc) {
|
|
775
|
+
invalid_param = true;
|
|
776
|
+
break;
|
|
777
|
+
}
|
|
778
|
+
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
|
779
|
+
} else if (arg == "--delay") {
|
|
780
|
+
if (++i >= argc) {
|
|
781
|
+
invalid_param = true;
|
|
782
|
+
break;
|
|
783
|
+
}
|
|
784
|
+
params.delay = std::stoi(argv[i]);
|
|
785
|
+
} else if (arg == "-o" || arg == "--output") {
|
|
786
|
+
if (++i >= argc) {
|
|
787
|
+
invalid_param = true;
|
|
788
|
+
break;
|
|
789
|
+
}
|
|
790
|
+
invalid_param = !output_format_from_str(argv[i], params.output_format);
|
|
791
|
+
} else if (arg == "-oe" || arg == "--output-err") {
|
|
792
|
+
if (++i >= argc) {
|
|
793
|
+
invalid_param = true;
|
|
794
|
+
break;
|
|
795
|
+
}
|
|
796
|
+
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
|
|
797
|
+
} else if (arg == "-v" || arg == "--verbose") {
|
|
798
|
+
params.verbose = true;
|
|
799
|
+
} else if (arg == "--progress") {
|
|
800
|
+
params.progress = true;
|
|
801
|
+
} else {
|
|
586
802
|
invalid_param = true;
|
|
587
803
|
break;
|
|
588
804
|
}
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
params.verbose = true;
|
|
592
|
-
} else if (arg == "--progress") {
|
|
593
|
-
params.progress = true;
|
|
594
|
-
} else {
|
|
805
|
+
} catch (const std::exception & e) {
|
|
806
|
+
fprintf(stderr, "error: %s\n", e.what());
|
|
595
807
|
invalid_param = true;
|
|
596
808
|
break;
|
|
597
809
|
}
|
|
598
810
|
}
|
|
811
|
+
|
|
599
812
|
if (invalid_param) {
|
|
600
813
|
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
|
601
814
|
print_usage(argc, argv);
|
|
@@ -615,6 +828,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
615
828
|
if (params.n_pg.empty()) {
|
|
616
829
|
params.n_pg = cmd_params_defaults.n_pg;
|
|
617
830
|
}
|
|
831
|
+
if (params.n_depth.empty()) {
|
|
832
|
+
params.n_depth = cmd_params_defaults.n_depth;
|
|
833
|
+
}
|
|
618
834
|
if (params.n_batch.empty()) {
|
|
619
835
|
params.n_batch = cmd_params_defaults.n_batch;
|
|
620
836
|
}
|
|
@@ -627,6 +843,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
627
843
|
if (params.type_v.empty()) {
|
|
628
844
|
params.type_v = cmd_params_defaults.type_v;
|
|
629
845
|
}
|
|
846
|
+
if (params.defrag_thold.empty()) {
|
|
847
|
+
params.defrag_thold = cmd_params_defaults.defrag_thold;
|
|
848
|
+
}
|
|
630
849
|
if (params.n_gpu_layers.empty()) {
|
|
631
850
|
params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
|
|
632
851
|
}
|
|
@@ -648,12 +867,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|
|
648
867
|
if (params.tensor_split.empty()) {
|
|
649
868
|
params.tensor_split = cmd_params_defaults.tensor_split;
|
|
650
869
|
}
|
|
870
|
+
if (params.tensor_buft_overrides.empty()) {
|
|
871
|
+
params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
|
|
872
|
+
}
|
|
651
873
|
if (params.use_mmap.empty()) {
|
|
652
874
|
params.use_mmap = cmd_params_defaults.use_mmap;
|
|
653
875
|
}
|
|
654
876
|
if (params.embeddings.empty()) {
|
|
655
877
|
params.embeddings = cmd_params_defaults.embeddings;
|
|
656
878
|
}
|
|
879
|
+
if (params.no_op_offload.empty()) {
|
|
880
|
+
params.no_op_offload = cmd_params_defaults.no_op_offload;
|
|
881
|
+
}
|
|
657
882
|
if (params.n_threads.empty()) {
|
|
658
883
|
params.n_threads = cmd_params_defaults.n_threads;
|
|
659
884
|
}
|
|
@@ -674,10 +899,12 @@ struct cmd_params_instance {
|
|
|
674
899
|
std::string model;
|
|
675
900
|
int n_prompt;
|
|
676
901
|
int n_gen;
|
|
902
|
+
int n_depth;
|
|
677
903
|
int n_batch;
|
|
678
904
|
int n_ubatch;
|
|
679
905
|
ggml_type type_k;
|
|
680
906
|
ggml_type type_v;
|
|
907
|
+
float defrag_thold;
|
|
681
908
|
int n_threads;
|
|
682
909
|
std::string cpu_mask;
|
|
683
910
|
bool cpu_strict;
|
|
@@ -689,8 +916,10 @@ struct cmd_params_instance {
|
|
|
689
916
|
bool no_kv_offload;
|
|
690
917
|
bool flash_attn;
|
|
691
918
|
std::vector<float> tensor_split;
|
|
919
|
+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
692
920
|
bool use_mmap;
|
|
693
921
|
bool embeddings;
|
|
922
|
+
bool no_op_offload;
|
|
694
923
|
|
|
695
924
|
llama_model_params to_llama_mparams() const {
|
|
696
925
|
llama_model_params mparams = llama_model_default_params();
|
|
@@ -733,26 +962,35 @@ struct cmd_params_instance {
|
|
|
733
962
|
mparams.tensor_split = tensor_split.data();
|
|
734
963
|
mparams.use_mmap = use_mmap;
|
|
735
964
|
|
|
965
|
+
if (tensor_buft_overrides.empty()) {
|
|
966
|
+
mparams.tensor_buft_overrides = nullptr;
|
|
967
|
+
} else {
|
|
968
|
+
GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
|
|
969
|
+
mparams.tensor_buft_overrides = tensor_buft_overrides.data();
|
|
970
|
+
}
|
|
971
|
+
|
|
736
972
|
return mparams;
|
|
737
973
|
}
|
|
738
974
|
|
|
739
975
|
bool equal_mparams(const cmd_params_instance & other) const {
|
|
740
976
|
return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
|
|
741
977
|
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
|
|
742
|
-
tensor_split == other.tensor_split;
|
|
978
|
+
tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
|
|
743
979
|
}
|
|
744
980
|
|
|
745
981
|
llama_context_params to_llama_cparams() const {
|
|
746
982
|
llama_context_params cparams = llama_context_default_params();
|
|
747
983
|
|
|
748
|
-
cparams.n_ctx
|
|
749
|
-
cparams.n_batch
|
|
750
|
-
cparams.n_ubatch
|
|
751
|
-
cparams.type_k
|
|
752
|
-
cparams.type_v
|
|
753
|
-
cparams.
|
|
754
|
-
cparams.
|
|
755
|
-
cparams.
|
|
984
|
+
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
|
985
|
+
cparams.n_batch = n_batch;
|
|
986
|
+
cparams.n_ubatch = n_ubatch;
|
|
987
|
+
cparams.type_k = type_k;
|
|
988
|
+
cparams.type_v = type_v;
|
|
989
|
+
cparams.defrag_thold = defrag_thold;
|
|
990
|
+
cparams.offload_kqv = !no_kv_offload;
|
|
991
|
+
cparams.flash_attn = flash_attn;
|
|
992
|
+
cparams.embeddings = embeddings;
|
|
993
|
+
cparams.op_offload = !no_op_offload;
|
|
756
994
|
|
|
757
995
|
return cparams;
|
|
758
996
|
}
|
|
@@ -769,17 +1007,21 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
769
1007
|
for (const auto & sm : params.split_mode)
|
|
770
1008
|
for (const auto & mg : params.main_gpu)
|
|
771
1009
|
for (const auto & ts : params.tensor_split)
|
|
1010
|
+
for (const auto & ot : params.tensor_buft_overrides)
|
|
772
1011
|
for (const auto & mmp : params.use_mmap)
|
|
773
1012
|
for (const auto & embd : params.embeddings)
|
|
1013
|
+
for (const auto & nopo : params.no_op_offload)
|
|
774
1014
|
for (const auto & nb : params.n_batch)
|
|
775
1015
|
for (const auto & nub : params.n_ubatch)
|
|
776
1016
|
for (const auto & tk : params.type_k)
|
|
777
1017
|
for (const auto & tv : params.type_v)
|
|
1018
|
+
for (const auto & defrag_thold : params.defrag_thold)
|
|
778
1019
|
for (const auto & nkvo : params.no_kv_offload)
|
|
779
1020
|
for (const auto & fa : params.flash_attn)
|
|
780
1021
|
for (const auto & nt : params.n_threads)
|
|
781
1022
|
for (const auto & cm : params.cpu_mask)
|
|
782
1023
|
for (const auto & cs : params.cpu_strict)
|
|
1024
|
+
for (const auto & nd : params.n_depth)
|
|
783
1025
|
for (const auto & pl : params.poll) {
|
|
784
1026
|
for (const auto & n_prompt : params.n_prompt) {
|
|
785
1027
|
if (n_prompt == 0) {
|
|
@@ -789,10 +1031,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
789
1031
|
/* .model = */ m,
|
|
790
1032
|
/* .n_prompt = */ n_prompt,
|
|
791
1033
|
/* .n_gen = */ 0,
|
|
1034
|
+
/* .n_depth = */ nd,
|
|
792
1035
|
/* .n_batch = */ nb,
|
|
793
1036
|
/* .n_ubatch = */ nub,
|
|
794
1037
|
/* .type_k = */ tk,
|
|
795
1038
|
/* .type_v = */ tv,
|
|
1039
|
+
/* .defrag_thold = */ defrag_thold,
|
|
796
1040
|
/* .n_threads = */ nt,
|
|
797
1041
|
/* .cpu_mask = */ cm,
|
|
798
1042
|
/* .cpu_strict = */ cs,
|
|
@@ -804,8 +1048,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
804
1048
|
/* .no_kv_offload= */ nkvo,
|
|
805
1049
|
/* .flash_attn = */ fa,
|
|
806
1050
|
/* .tensor_split = */ ts,
|
|
1051
|
+
/* .tensor_buft_overrides = */ ot,
|
|
807
1052
|
/* .use_mmap = */ mmp,
|
|
808
1053
|
/* .embeddings = */ embd,
|
|
1054
|
+
/* .no_op_offload= */ nopo,
|
|
809
1055
|
};
|
|
810
1056
|
instances.push_back(instance);
|
|
811
1057
|
}
|
|
@@ -818,10 +1064,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
818
1064
|
/* .model = */ m,
|
|
819
1065
|
/* .n_prompt = */ 0,
|
|
820
1066
|
/* .n_gen = */ n_gen,
|
|
1067
|
+
/* .n_depth = */ nd,
|
|
821
1068
|
/* .n_batch = */ nb,
|
|
822
1069
|
/* .n_ubatch = */ nub,
|
|
823
1070
|
/* .type_k = */ tk,
|
|
824
1071
|
/* .type_v = */ tv,
|
|
1072
|
+
/* .defrag_thold = */ defrag_thold,
|
|
825
1073
|
/* .n_threads = */ nt,
|
|
826
1074
|
/* .cpu_mask = */ cm,
|
|
827
1075
|
/* .cpu_strict = */ cs,
|
|
@@ -833,8 +1081,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
833
1081
|
/* .no_kv_offload= */ nkvo,
|
|
834
1082
|
/* .flash_attn = */ fa,
|
|
835
1083
|
/* .tensor_split = */ ts,
|
|
1084
|
+
/* .tensor_buft_overrides = */ ot,
|
|
836
1085
|
/* .use_mmap = */ mmp,
|
|
837
1086
|
/* .embeddings = */ embd,
|
|
1087
|
+
/* .no_op_offload= */ nopo,
|
|
838
1088
|
};
|
|
839
1089
|
instances.push_back(instance);
|
|
840
1090
|
}
|
|
@@ -847,10 +1097,12 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
847
1097
|
/* .model = */ m,
|
|
848
1098
|
/* .n_prompt = */ n_pg.first,
|
|
849
1099
|
/* .n_gen = */ n_pg.second,
|
|
1100
|
+
/* .n_depth = */ nd,
|
|
850
1101
|
/* .n_batch = */ nb,
|
|
851
1102
|
/* .n_ubatch = */ nub,
|
|
852
1103
|
/* .type_k = */ tk,
|
|
853
1104
|
/* .type_v = */ tv,
|
|
1105
|
+
/* .defrag_thold = */ defrag_thold,
|
|
854
1106
|
/* .n_threads = */ nt,
|
|
855
1107
|
/* .cpu_mask = */ cm,
|
|
856
1108
|
/* .cpu_strict = */ cs,
|
|
@@ -862,8 +1114,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|
|
862
1114
|
/* .no_kv_offload= */ nkvo,
|
|
863
1115
|
/* .flash_attn = */ fa,
|
|
864
1116
|
/* .tensor_split = */ ts,
|
|
1117
|
+
/* .tensor_buft_overrides = */ ot,
|
|
865
1118
|
/* .use_mmap = */ mmp,
|
|
866
1119
|
/* .embeddings = */ embd,
|
|
1120
|
+
/* .no_op_offload= */ nopo,
|
|
867
1121
|
};
|
|
868
1122
|
instances.push_back(instance);
|
|
869
1123
|
}
|
|
@@ -890,16 +1144,20 @@ struct test {
|
|
|
890
1144
|
int poll;
|
|
891
1145
|
ggml_type type_k;
|
|
892
1146
|
ggml_type type_v;
|
|
1147
|
+
float defrag_thold;
|
|
893
1148
|
int n_gpu_layers;
|
|
894
1149
|
llama_split_mode split_mode;
|
|
895
1150
|
int main_gpu;
|
|
896
1151
|
bool no_kv_offload;
|
|
897
1152
|
bool flash_attn;
|
|
898
1153
|
std::vector<float> tensor_split;
|
|
1154
|
+
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
899
1155
|
bool use_mmap;
|
|
900
1156
|
bool embeddings;
|
|
1157
|
+
bool no_op_offload;
|
|
901
1158
|
int n_prompt;
|
|
902
1159
|
int n_gen;
|
|
1160
|
+
int n_depth;
|
|
903
1161
|
std::string test_time;
|
|
904
1162
|
std::vector<uint64_t> samples_ns;
|
|
905
1163
|
|
|
@@ -921,16 +1179,20 @@ struct test {
|
|
|
921
1179
|
poll = inst.poll;
|
|
922
1180
|
type_k = inst.type_k;
|
|
923
1181
|
type_v = inst.type_v;
|
|
1182
|
+
defrag_thold = inst.defrag_thold;
|
|
924
1183
|
n_gpu_layers = inst.n_gpu_layers;
|
|
925
1184
|
split_mode = inst.split_mode;
|
|
926
1185
|
main_gpu = inst.main_gpu;
|
|
927
1186
|
no_kv_offload = inst.no_kv_offload;
|
|
928
1187
|
flash_attn = inst.flash_attn;
|
|
929
1188
|
tensor_split = inst.tensor_split;
|
|
1189
|
+
tensor_buft_overrides = inst.tensor_buft_overrides;
|
|
930
1190
|
use_mmap = inst.use_mmap;
|
|
931
1191
|
embeddings = inst.embeddings;
|
|
1192
|
+
no_op_offload = inst.no_op_offload;
|
|
932
1193
|
n_prompt = inst.n_prompt;
|
|
933
1194
|
n_gen = inst.n_gen;
|
|
1195
|
+
n_depth = inst.n_depth;
|
|
934
1196
|
// RFC 3339 date-time format
|
|
935
1197
|
time_t t = time(NULL);
|
|
936
1198
|
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
|
@@ -972,9 +1234,10 @@ struct test {
|
|
|
972
1234
|
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
|
973
1235
|
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
|
974
1236
|
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
|
975
|
-
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "
|
|
976
|
-
"
|
|
977
|
-
"
|
|
1237
|
+
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
|
1238
|
+
"defrag_thold",
|
|
1239
|
+
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
|
1240
|
+
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
|
978
1241
|
};
|
|
979
1242
|
return fields;
|
|
980
1243
|
}
|
|
@@ -984,15 +1247,15 @@ struct test {
|
|
|
984
1247
|
static field_type get_field_type(const std::string & field) {
|
|
985
1248
|
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
|
986
1249
|
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
|
987
|
-
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "
|
|
988
|
-
field == "stddev_ns") {
|
|
1250
|
+
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
|
1251
|
+
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
|
|
989
1252
|
return INT;
|
|
990
1253
|
}
|
|
991
1254
|
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
|
992
1255
|
field == "use_mmap" || field == "embeddings") {
|
|
993
1256
|
return BOOL;
|
|
994
1257
|
}
|
|
995
|
-
if (field == "avg_ts" || field == "stddev_ts") {
|
|
1258
|
+
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
|
|
996
1259
|
return FLOAT;
|
|
997
1260
|
}
|
|
998
1261
|
return STRING;
|
|
@@ -1000,6 +1263,7 @@ struct test {
|
|
|
1000
1263
|
|
|
1001
1264
|
std::vector<std::string> get_values() const {
|
|
1002
1265
|
std::string tensor_split_str;
|
|
1266
|
+
std::string tensor_buft_overrides_str;
|
|
1003
1267
|
int max_nonzero = 0;
|
|
1004
1268
|
for (size_t i = 0; i < llama_max_devices(); i++) {
|
|
1005
1269
|
if (tensor_split[i] > 0) {
|
|
@@ -1014,6 +1278,26 @@ struct test {
|
|
|
1014
1278
|
tensor_split_str += "/";
|
|
1015
1279
|
}
|
|
1016
1280
|
}
|
|
1281
|
+
if (tensor_buft_overrides.size() == 1) {
|
|
1282
|
+
// Last element of tensor_buft_overrides is always a null pattern
|
|
1283
|
+
// so if it is only one element long, it must be a null pattern.
|
|
1284
|
+
GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
|
|
1285
|
+
tensor_buft_overrides_str += "none";
|
|
1286
|
+
} else {
|
|
1287
|
+
for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
|
|
1288
|
+
// Last element of tensor_buft_overrides is always a null pattern
|
|
1289
|
+
if (tensor_buft_overrides[i].pattern == nullptr) {
|
|
1290
|
+
tensor_buft_overrides_str += "none";
|
|
1291
|
+
} else {
|
|
1292
|
+
tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
|
|
1293
|
+
tensor_buft_overrides_str += "=";
|
|
1294
|
+
tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
|
|
1295
|
+
}
|
|
1296
|
+
if (i + 2 < tensor_buft_overrides.size()) {
|
|
1297
|
+
tensor_buft_overrides_str += ";";
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
}
|
|
1017
1301
|
std::vector<std::string> values = { build_commit,
|
|
1018
1302
|
std::to_string(build_number),
|
|
1019
1303
|
cpu_info,
|
|
@@ -1037,10 +1321,14 @@ struct test {
|
|
|
1037
1321
|
std::to_string(no_kv_offload),
|
|
1038
1322
|
std::to_string(flash_attn),
|
|
1039
1323
|
tensor_split_str,
|
|
1324
|
+
tensor_buft_overrides_str,
|
|
1325
|
+
std::to_string(defrag_thold),
|
|
1040
1326
|
std::to_string(use_mmap),
|
|
1041
1327
|
std::to_string(embeddings),
|
|
1328
|
+
std::to_string(no_op_offload),
|
|
1042
1329
|
std::to_string(n_prompt),
|
|
1043
1330
|
std::to_string(n_gen),
|
|
1331
|
+
std::to_string(n_depth),
|
|
1044
1332
|
test_time,
|
|
1045
1333
|
std::to_string(avg_ns()),
|
|
1046
1334
|
std::to_string(stdev_ns()),
|
|
@@ -1218,7 +1506,10 @@ struct markdown_printer : public printer {
|
|
|
1218
1506
|
return 4;
|
|
1219
1507
|
}
|
|
1220
1508
|
if (field == "test") {
|
|
1221
|
-
return
|
|
1509
|
+
return 15;
|
|
1510
|
+
}
|
|
1511
|
+
if (field == "no_op_offload") {
|
|
1512
|
+
return 4;
|
|
1222
1513
|
}
|
|
1223
1514
|
|
|
1224
1515
|
int width = std::max((int) field.length(), 10);
|
|
@@ -1251,9 +1542,15 @@ struct markdown_printer : public printer {
|
|
|
1251
1542
|
if (field == "embeddings") {
|
|
1252
1543
|
return "embd";
|
|
1253
1544
|
}
|
|
1545
|
+
if (field == "no_op_offload") {
|
|
1546
|
+
return "nopo";
|
|
1547
|
+
}
|
|
1254
1548
|
if (field == "tensor_split") {
|
|
1255
1549
|
return "ts";
|
|
1256
1550
|
}
|
|
1551
|
+
if (field == "tensor_buft_overrides") {
|
|
1552
|
+
return "ot";
|
|
1553
|
+
}
|
|
1257
1554
|
return field;
|
|
1258
1555
|
}
|
|
1259
1556
|
|
|
@@ -1292,6 +1589,9 @@ struct markdown_printer : public printer {
|
|
|
1292
1589
|
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
|
1293
1590
|
fields.emplace_back("type_v");
|
|
1294
1591
|
}
|
|
1592
|
+
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
|
|
1593
|
+
fields.emplace_back("defrag_thold");
|
|
1594
|
+
}
|
|
1295
1595
|
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
|
1296
1596
|
fields.emplace_back("main_gpu");
|
|
1297
1597
|
}
|
|
@@ -1307,12 +1607,18 @@ struct markdown_printer : public printer {
|
|
|
1307
1607
|
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
|
1308
1608
|
fields.emplace_back("tensor_split");
|
|
1309
1609
|
}
|
|
1610
|
+
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
|
|
1611
|
+
fields.emplace_back("tensor_buft_overrides");
|
|
1612
|
+
}
|
|
1310
1613
|
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
|
1311
1614
|
fields.emplace_back("use_mmap");
|
|
1312
1615
|
}
|
|
1313
1616
|
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
|
1314
1617
|
fields.emplace_back("embeddings");
|
|
1315
1618
|
}
|
|
1619
|
+
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
|
|
1620
|
+
fields.emplace_back("no_op_offload");
|
|
1621
|
+
}
|
|
1316
1622
|
fields.emplace_back("test");
|
|
1317
1623
|
fields.emplace_back("t/s");
|
|
1318
1624
|
|
|
@@ -1362,6 +1668,10 @@ struct markdown_printer : public printer {
|
|
|
1362
1668
|
} else {
|
|
1363
1669
|
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
|
|
1364
1670
|
}
|
|
1671
|
+
if (t.n_depth > 0) {
|
|
1672
|
+
int len = strlen(buf);
|
|
1673
|
+
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
|
|
1674
|
+
}
|
|
1365
1675
|
value = buf;
|
|
1366
1676
|
} else if (field == "t/s") {
|
|
1367
1677
|
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
|
@@ -1427,7 +1737,7 @@ struct sql_printer : public printer {
|
|
|
1427
1737
|
}
|
|
1428
1738
|
};
|
|
1429
1739
|
|
|
1430
|
-
static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1740
|
+
static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1431
1741
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1432
1742
|
|
|
1433
1743
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1444,14 +1754,19 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
|
|
|
1444
1754
|
for (int i = 1; i < n_tokens; i++) {
|
|
1445
1755
|
tokens[i] = std::rand() % n_vocab;
|
|
1446
1756
|
}
|
|
1447
|
-
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1757
|
+
int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1758
|
+
if (res != 0) {
|
|
1759
|
+
fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
|
|
1760
|
+
return false;
|
|
1761
|
+
}
|
|
1448
1762
|
n_processed += n_tokens;
|
|
1449
1763
|
}
|
|
1450
1764
|
|
|
1451
1765
|
llama_synchronize(ctx);
|
|
1766
|
+
return true;
|
|
1452
1767
|
}
|
|
1453
1768
|
|
|
1454
|
-
static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1769
|
+
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1455
1770
|
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1456
1771
|
|
|
1457
1772
|
const llama_model * model = llama_get_model(ctx);
|
|
@@ -1461,10 +1776,15 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
|
1461
1776
|
llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
|
|
1462
1777
|
|
|
1463
1778
|
for (int i = 0; i < n_gen; i++) {
|
|
1464
|
-
llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1779
|
+
int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1780
|
+
if (res != 0) {
|
|
1781
|
+
fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
|
|
1782
|
+
return false;
|
|
1783
|
+
}
|
|
1465
1784
|
llama_synchronize(ctx);
|
|
1466
1785
|
token = std::rand() % n_vocab;
|
|
1467
1786
|
}
|
|
1787
|
+
return true;
|
|
1468
1788
|
}
|
|
1469
1789
|
|
|
1470
1790
|
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
|
|
@@ -1507,10 +1827,11 @@ int main(int argc, char ** argv) {
|
|
|
1507
1827
|
fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
|
|
1508
1828
|
#endif
|
|
1509
1829
|
|
|
1510
|
-
cmd_params params = parse_cmd_params(argc, argv);
|
|
1511
|
-
|
|
1512
1830
|
// initialize backends
|
|
1513
1831
|
ggml_backend_load_all();
|
|
1832
|
+
|
|
1833
|
+
cmd_params params = parse_cmd_params(argc, argv);
|
|
1834
|
+
|
|
1514
1835
|
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
|
1515
1836
|
if (!cpu_dev) {
|
|
1516
1837
|
fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
|
|
@@ -1608,18 +1929,38 @@ int main(int argc, char ** argv) {
|
|
|
1608
1929
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
|
|
1609
1930
|
}
|
|
1610
1931
|
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
|
1611
|
-
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1932
|
+
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1933
|
+
if (!res) {
|
|
1934
|
+
fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
|
|
1935
|
+
exit(1);
|
|
1936
|
+
}
|
|
1612
1937
|
}
|
|
1613
1938
|
if (t.n_gen > 0) {
|
|
1614
1939
|
if (params.progress) {
|
|
1615
1940
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
|
|
1616
1941
|
}
|
|
1617
|
-
test_gen(ctx, 1, t.n_threads);
|
|
1942
|
+
bool res = test_gen(ctx, 1, t.n_threads);
|
|
1943
|
+
if (!res) {
|
|
1944
|
+
fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
|
|
1945
|
+
exit(1);
|
|
1946
|
+
}
|
|
1618
1947
|
}
|
|
1619
1948
|
|
|
1620
1949
|
for (int i = 0; i < params.reps; i++) {
|
|
1621
1950
|
llama_kv_self_clear(ctx);
|
|
1622
1951
|
|
|
1952
|
+
if (t.n_depth > 0) {
|
|
1953
|
+
if (params.progress) {
|
|
1954
|
+
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
|
|
1955
|
+
i + 1, params.reps);
|
|
1956
|
+
}
|
|
1957
|
+
bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
|
1958
|
+
if (!res) {
|
|
1959
|
+
fprintf(stderr, "%s: error: failed to run depth\n", __func__);
|
|
1960
|
+
exit(1);
|
|
1961
|
+
}
|
|
1962
|
+
}
|
|
1963
|
+
|
|
1623
1964
|
uint64_t t_start = get_time_ns();
|
|
1624
1965
|
|
|
1625
1966
|
if (t.n_prompt > 0) {
|
|
@@ -1627,14 +1968,22 @@ int main(int argc, char ** argv) {
|
|
|
1627
1968
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
|
|
1628
1969
|
i + 1, params.reps);
|
|
1629
1970
|
}
|
|
1630
|
-
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1971
|
+
bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
|
1972
|
+
if (!res) {
|
|
1973
|
+
fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
|
|
1974
|
+
exit(1);
|
|
1975
|
+
}
|
|
1631
1976
|
}
|
|
1632
1977
|
if (t.n_gen > 0) {
|
|
1633
1978
|
if (params.progress) {
|
|
1634
1979
|
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
|
|
1635
1980
|
i + 1, params.reps);
|
|
1636
1981
|
}
|
|
1637
|
-
test_gen(ctx, t.n_gen, t.n_threads);
|
|
1982
|
+
bool res = test_gen(ctx, t.n_gen, t.n_threads);
|
|
1983
|
+
if (!res) {
|
|
1984
|
+
fprintf(stderr, "%s: error: failed to run gen\n", __func__);
|
|
1985
|
+
exit(1);
|
|
1986
|
+
}
|
|
1638
1987
|
}
|
|
1639
1988
|
|
|
1640
1989
|
uint64_t t_ns = get_time_ns() - t_start;
|