@fugood/llama.node 0.6.3 → 1.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +40 -30
- package/README.md +4 -1
- package/lib/binding.js +41 -29
- package/lib/binding.ts +26 -25
- package/package.json +40 -7
- package/scripts/build.js +47 -0
- package/scripts/llama.cpp.patch +109 -0
- package/src/anyascii.c +22223 -0
- package/src/anyascii.h +42 -0
- package/src/tts_utils.cpp +20 -7
- package/src/tts_utils.h +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
- package/src/llama.cpp/.github/workflows/build.yml +0 -1078
- package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
- package/src/llama.cpp/.github/workflows/docker.yml +0 -178
- package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
- package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
- package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
- package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
- package/src/llama.cpp/.github/workflows/release.yml +0 -739
- package/src/llama.cpp/.github/workflows/server.yml +0 -237
- package/src/llama.cpp/.github/workflows/winget.yml +0 -42
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
- package/src/llama.cpp/cmake/build-info.cmake +0 -64
- package/src/llama.cpp/cmake/common.cmake +0 -35
- package/src/llama.cpp/cmake/git-vars.cmake +0 -22
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
- package/src/llama.cpp/common/build-info.cpp.in +0 -4
- package/src/llama.cpp/docs/build.md +0 -561
- package/src/llama.cpp/examples/CMakeLists.txt +0 -43
- package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/batched/batched.cpp +0 -246
- package/src/llama.cpp/examples/chat-13B.bat +0 -57
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
- package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
- package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/simple/simple.cpp +0 -206
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/sycl/build.sh +0 -23
- package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
- package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
- package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
- package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/training/finetune.cpp +0 -96
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
- package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
- package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
- package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
- package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
- package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
- package/src/llama.cpp/ggml/src/ggml.c +0 -6550
- package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
- package/src/llama.cpp/models/.editorconfig +0 -1
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
- package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
- package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/src/llama.cpp/prompts/alpaca.txt +0 -1
- package/src/llama.cpp/prompts/assistant.txt +0 -31
- package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/src/llama.cpp/prompts/chat.txt +0 -28
- package/src/llama.cpp/prompts/dan-modified.txt +0 -1
- package/src/llama.cpp/prompts/dan.txt +0 -1
- package/src/llama.cpp/prompts/mnemonics.txt +0 -93
- package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/src/llama.cpp/prompts/reason-act.txt +0 -18
- package/src/llama.cpp/requirements/requirements-all.txt +0 -15
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
- package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
- package/src/llama.cpp/requirements.txt +0 -13
- package/src/llama.cpp/scripts/build-info.sh +0 -30
- package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
- package/src/llama.cpp/scripts/xxd.cmake +0 -16
- package/src/llama.cpp/tests/CMakeLists.txt +0 -177
- package/src/llama.cpp/tests/get-model.cpp +0 -21
- package/src/llama.cpp/tests/get-model.h +0 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
- package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
- package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
- package/src/llama.cpp/tests/test-barrier.cpp +0 -94
- package/src/llama.cpp/tests/test-c.c +0 -7
- package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
- package/src/llama.cpp/tests/test-chat.cpp +0 -985
- package/src/llama.cpp/tests/test-double-float.cpp +0 -57
- package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
- package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
- package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
- package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
- package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
- package/src/llama.cpp/tests/test-log.cpp +0 -39
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
- package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
- package/src/llama.cpp/tests/test-opt.cpp +0 -904
- package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
- package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
- package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
- package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
- package/src/llama.cpp/tests/test-rope.cpp +0 -262
- package/src/llama.cpp/tests/test-sampling.cpp +0 -399
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
- package/src/llama.cpp/tools/CMakeLists.txt +0 -39
- package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
- package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
- package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
- package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
- package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
- package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
- package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
- package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
- package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
- package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
- package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
- package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/main/main.cpp +0 -977
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
- package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
- package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
- package/src/llama.cpp/tools/mtmd/clip.h +0 -101
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
- package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
- package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
- package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
- package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
- package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
- package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
- package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
- package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
- package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
- package/src/llama.cpp/tools/run/run.cpp +0 -1261
- package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
- package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
- package/src/llama.cpp/tools/server/httplib.h +0 -10506
- package/src/llama.cpp/tools/server/server.cpp +0 -4966
- package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
- package/src/llama.cpp/tools/server/utils.hpp +0 -1337
- package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
- package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/tts/tts.cpp +0 -1092
|
@@ -1,2024 +0,0 @@
|
|
|
1
|
-
#include <algorithm>
|
|
2
|
-
#include <array>
|
|
3
|
-
#include <cassert>
|
|
4
|
-
#include <chrono>
|
|
5
|
-
#include <cinttypes>
|
|
6
|
-
#include <clocale>
|
|
7
|
-
#include <cmath>
|
|
8
|
-
#include <cstdio>
|
|
9
|
-
#include <cstdlib>
|
|
10
|
-
#include <cstring>
|
|
11
|
-
#include <ctime>
|
|
12
|
-
#include <iterator>
|
|
13
|
-
#include <map>
|
|
14
|
-
#include <numeric>
|
|
15
|
-
#include <regex>
|
|
16
|
-
#include <sstream>
|
|
17
|
-
#include <string>
|
|
18
|
-
#include <thread>
|
|
19
|
-
#include <vector>
|
|
20
|
-
|
|
21
|
-
#include "common.h"
|
|
22
|
-
#include "ggml.h"
|
|
23
|
-
#include "llama.h"
|
|
24
|
-
|
|
25
|
-
#ifdef _WIN32
|
|
26
|
-
# define WIN32_LEAN_AND_MEAN
|
|
27
|
-
# ifndef NOMINMAX
|
|
28
|
-
# define NOMINMAX
|
|
29
|
-
# endif
|
|
30
|
-
# include <windows.h>
|
|
31
|
-
#endif
|
|
32
|
-
|
|
33
|
-
// utils
|
|
34
|
-
static uint64_t get_time_ns() {
|
|
35
|
-
using clock = std::chrono::high_resolution_clock;
|
|
36
|
-
return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) {
|
|
40
|
-
if (a.pattern != b.pattern) {
|
|
41
|
-
// cString comparison that may be null
|
|
42
|
-
if (a.pattern == nullptr || b.pattern == nullptr) {
|
|
43
|
-
return false;
|
|
44
|
-
}
|
|
45
|
-
if (strcmp(a.pattern, b.pattern) != 0) {
|
|
46
|
-
return false;
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
if (a.buft != b.buft) {
|
|
50
|
-
return false;
|
|
51
|
-
}
|
|
52
|
-
return true;
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
static bool vec_tensor_buft_override_equal(const std::vector<llama_model_tensor_buft_override>& a, const std::vector<llama_model_tensor_buft_override>& b) {
|
|
56
|
-
if (a.size() != b.size()) {
|
|
57
|
-
return false;
|
|
58
|
-
}
|
|
59
|
-
for (size_t i = 0; i < a.size(); i++) {
|
|
60
|
-
if (!tensor_buft_override_equal(a[i], b[i])) {
|
|
61
|
-
return false;
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
return true;
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
static bool vec_vec_tensor_buft_override_equal(const std::vector<std::vector<llama_model_tensor_buft_override>>& a, const std::vector<std::vector<llama_model_tensor_buft_override>>& b) {
|
|
68
|
-
if (a.size() != b.size()) {
|
|
69
|
-
return false;
|
|
70
|
-
}
|
|
71
|
-
for (size_t i = 0; i < a.size(); i++) {
|
|
72
|
-
if (!vec_tensor_buft_override_equal(a[i], b[i])) {
|
|
73
|
-
return false;
|
|
74
|
-
}
|
|
75
|
-
}
|
|
76
|
-
return true;
|
|
77
|
-
}
|
|
78
|
-
|
|
79
|
-
template <class T> static std::string join(const std::vector<T> & values, const std::string & delim) {
|
|
80
|
-
std::ostringstream str;
|
|
81
|
-
for (size_t i = 0; i < values.size(); i++) {
|
|
82
|
-
str << values[i];
|
|
83
|
-
if (i < values.size() - 1) {
|
|
84
|
-
str << delim;
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
return str.str();
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
template <typename T, typename F> static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
|
|
91
|
-
std::vector<std::string> str_values;
|
|
92
|
-
std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
|
|
93
|
-
return str_values;
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
template <typename T> static T avg(const std::vector<T> & v) {
|
|
97
|
-
if (v.empty()) {
|
|
98
|
-
return 0;
|
|
99
|
-
}
|
|
100
|
-
T sum = std::accumulate(v.begin(), v.end(), T(0));
|
|
101
|
-
return sum / (T) v.size();
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
template <typename T> static T stdev(const std::vector<T> & v) {
|
|
105
|
-
if (v.size() <= 1) {
|
|
106
|
-
return 0;
|
|
107
|
-
}
|
|
108
|
-
T mean = avg(v);
|
|
109
|
-
T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
|
|
110
|
-
T stdev = std::sqrt(sq_sum / (T) (v.size() - 1) - mean * mean * (T) v.size() / (T) (v.size() - 1));
|
|
111
|
-
return stdev;
|
|
112
|
-
}
|
|
113
|
-
|
|
114
|
-
static std::string get_cpu_info() {
|
|
115
|
-
std::vector<std::string> cpu_list;
|
|
116
|
-
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
117
|
-
auto * dev = ggml_backend_dev_get(i);
|
|
118
|
-
auto dev_type = ggml_backend_dev_type(dev);
|
|
119
|
-
if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
|
|
120
|
-
cpu_list.push_back(ggml_backend_dev_description(dev));
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
return join(cpu_list, ", ");
|
|
124
|
-
}
|
|
125
|
-
|
|
126
|
-
static std::string get_gpu_info() {
|
|
127
|
-
std::vector<std::string> gpu_list;
|
|
128
|
-
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
|
129
|
-
auto * dev = ggml_backend_dev_get(i);
|
|
130
|
-
auto dev_type = ggml_backend_dev_type(dev);
|
|
131
|
-
if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
|
|
132
|
-
gpu_list.push_back(ggml_backend_dev_description(dev));
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
return join(gpu_list, ", ");
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
// command line params
|
|
139
|
-
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
|
|
140
|
-
|
|
141
|
-
static const char * output_format_str(output_formats format) {
|
|
142
|
-
switch (format) {
|
|
143
|
-
case NONE:
|
|
144
|
-
return "none";
|
|
145
|
-
case CSV:
|
|
146
|
-
return "csv";
|
|
147
|
-
case JSON:
|
|
148
|
-
return "json";
|
|
149
|
-
case JSONL:
|
|
150
|
-
return "jsonl";
|
|
151
|
-
case MARKDOWN:
|
|
152
|
-
return "md";
|
|
153
|
-
case SQL:
|
|
154
|
-
return "sql";
|
|
155
|
-
default:
|
|
156
|
-
GGML_ABORT("invalid output format");
|
|
157
|
-
}
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
static bool output_format_from_str(const std::string & s, output_formats & format) {
|
|
161
|
-
if (s == "none") {
|
|
162
|
-
format = NONE;
|
|
163
|
-
} else if (s == "csv") {
|
|
164
|
-
format = CSV;
|
|
165
|
-
} else if (s == "json") {
|
|
166
|
-
format = JSON;
|
|
167
|
-
} else if (s == "jsonl") {
|
|
168
|
-
format = JSONL;
|
|
169
|
-
} else if (s == "md") {
|
|
170
|
-
format = MARKDOWN;
|
|
171
|
-
} else if (s == "sql") {
|
|
172
|
-
format = SQL;
|
|
173
|
-
} else {
|
|
174
|
-
return false;
|
|
175
|
-
}
|
|
176
|
-
return true;
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
static const char * split_mode_str(llama_split_mode mode) {
|
|
180
|
-
switch (mode) {
|
|
181
|
-
case LLAMA_SPLIT_MODE_NONE:
|
|
182
|
-
return "none";
|
|
183
|
-
case LLAMA_SPLIT_MODE_LAYER:
|
|
184
|
-
return "layer";
|
|
185
|
-
case LLAMA_SPLIT_MODE_ROW:
|
|
186
|
-
return "row";
|
|
187
|
-
default:
|
|
188
|
-
GGML_ABORT("invalid split mode");
|
|
189
|
-
}
|
|
190
|
-
}
|
|
191
|
-
|
|
192
|
-
static std::string pair_str(const std::pair<int, int> & p) {
|
|
193
|
-
static char buf[32];
|
|
194
|
-
snprintf(buf, sizeof(buf), "%d,%d", p.first, p.second);
|
|
195
|
-
return buf;
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
static std::vector<int> parse_int_range(const std::string & s) {
|
|
199
|
-
// first[-last[(+|*)step]]
|
|
200
|
-
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
|
|
201
|
-
|
|
202
|
-
std::smatch match;
|
|
203
|
-
std::string::const_iterator search_start(s.cbegin());
|
|
204
|
-
std::vector<int> result;
|
|
205
|
-
while (std::regex_search(search_start, s.cend(), match, range_regex)) {
|
|
206
|
-
int first = std::stoi(match[1]);
|
|
207
|
-
int last = match[2].matched ? std::stoi(match[2]) : first;
|
|
208
|
-
char op = match[3].matched ? match[3].str()[0] : '+';
|
|
209
|
-
int step = match[4].matched ? std::stoi(match[4]) : 1;
|
|
210
|
-
|
|
211
|
-
for (int i = first; i <= last;) {
|
|
212
|
-
result.push_back(i);
|
|
213
|
-
|
|
214
|
-
int prev_i = i;
|
|
215
|
-
|
|
216
|
-
if (op == '+') {
|
|
217
|
-
i += step;
|
|
218
|
-
} else if (op == '*') {
|
|
219
|
-
i *= step;
|
|
220
|
-
} else {
|
|
221
|
-
throw std::invalid_argument("invalid range format");
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
if (i <= prev_i) {
|
|
225
|
-
throw std::invalid_argument("invalid range");
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
search_start = match.suffix().first;
|
|
229
|
-
}
|
|
230
|
-
|
|
231
|
-
if (search_start != s.cend()) {
|
|
232
|
-
throw std::invalid_argument("invalid range format");
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
return result;
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
// All benchmark options collected from the command line. Each vector field
// holds every value requested for that parameter; the benchmark later runs
// the cartesian product of all vector parameters (see get_cmd_params_instances).
// NOTE: field order must match the positional initializer in cmd_params_defaults.
struct cmd_params {
    std::vector<std::string> model;          // model file paths (-m)
    std::vector<int> n_prompt;               // prompt lengths to benchmark (-p)
    std::vector<int> n_gen;                  // generation lengths to benchmark (-n)
    std::vector<std::pair<int, int>> n_pg;   // combined prompt+gen pairs (-pg)
    std::vector<int> n_depth;                // context depth before the test (-d)
    std::vector<int> n_batch;                // logical batch sizes (-b)
    std::vector<int> n_ubatch;               // physical batch sizes (-ub)
    std::vector<ggml_type> type_k;           // K cache types (-ctk)
    std::vector<ggml_type> type_v;           // V cache types (-ctv)
    std::vector<float> defrag_thold;         // KV defrag thresholds (-dt)
    std::vector<int> n_threads;              // thread counts (-t)
    std::vector<std::string> cpu_mask;       // CPU affinity masks, hex (-C)
    std::vector<bool> cpu_strict;            // strict CPU placement (--cpu-strict)
    std::vector<int> poll;                   // polling level 0..100 (--poll)
    std::vector<int> n_gpu_layers;           // layers offloaded to GPU (-ngl)
    std::vector<std::string> rpc_servers;    // comma-joined RPC server lists (-rpc)
    std::vector<llama_split_mode> split_mode; // multi-GPU split modes (-sm)
    std::vector<int> main_gpu;               // main GPU index (-mg)
    std::vector<bool> no_kv_offload;         // disable KV cache offload (-nkvo)
    std::vector<bool> flash_attn;            // enable flash attention (-fa)
    std::vector<std::vector<float>> tensor_split; // per-device tensor split ratios (-ts)
    std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides; // tensor buffer-type overrides (-ot)
    std::vector<bool> use_mmap;              // mmap the model file (-mmp)
    std::vector<bool> embeddings;            // embeddings mode (-embd)
    std::vector<bool> no_op_offload;         // disable op offload (-nopo)
    // scalar options apply to the run as a whole (not part of the product)
    ggml_numa_strategy numa;                 // NUMA strategy (--numa)
    int reps;                                // repetitions per test (-r)
    ggml_sched_priority prio;                // process/thread priority (--prio)
    int delay;                               // delay between tests, seconds (--delay)
    bool verbose;                            // verbose output (-v)
    bool progress;                           // print progress indicators (--progress)
    output_formats output_format;            // stdout format (-o)
    output_formats output_format_stderr;     // stderr format (-oe)
};
|
|
273
|
-
|
|
274
|
-
// Default value for every cmd_params field. Used to seed the scalar options
// before parsing and to fill in any vector option the user did not supply
// (see the "set defaults" section at the end of parse_cmd_params).
static const cmd_params cmd_params_defaults = {
    /* model                */ { "models/7B/ggml-model-q4_0.gguf" },
    /* n_prompt             */ { 512 },
    /* n_gen                */ { 128 },
    /* n_pg                 */ {},
    /* n_depth              */ { 0 },
    /* n_batch              */ { 2048 },
    /* n_ubatch             */ { 512 },
    /* type_k               */ { GGML_TYPE_F16 },
    /* type_v               */ { GGML_TYPE_F16 },
    /* defrag_thold         */ { -1.0f },
    /* n_threads            */ { cpu_get_num_math() },
    /* cpu_mask             */ { "0x0" },
    /* cpu_strict           */ { false },
    /* poll                 */ { 50 },
    /* n_gpu_layers         */ { 99 },
    /* rpc_servers          */ { "" },
    /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
    /* main_gpu             */ { 0 },
    /* no_kv_offload        */ { false },
    /* flash_attn           */ { false },
    /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
    /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
    /* use_mmap             */ { true },
    /* embeddings           */ { false },
    /* no_op_offload        */ { false },
    /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
    /* reps                 */ 5,
    /* prio                 */ GGML_SCHED_PRIO_NORMAL,
    /* delay                */ 0,
    /* verbose              */ false,
    /* progress             */ false,
    /* output_format        */ MARKDOWN,
    /* output_format_stderr */ NONE,
};
|
|
309
|
-
|
|
310
|
-
static void print_usage(int /* argc */, char ** argv) {
|
|
311
|
-
printf("usage: %s [options]\n", argv[0]);
|
|
312
|
-
printf("\n");
|
|
313
|
-
printf("options:\n");
|
|
314
|
-
printf(" -h, --help\n");
|
|
315
|
-
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
|
|
316
|
-
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
|
|
317
|
-
cmd_params_defaults.reps);
|
|
318
|
-
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
|
|
319
|
-
cmd_params_defaults.prio);
|
|
320
|
-
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
|
|
321
|
-
cmd_params_defaults.delay);
|
|
322
|
-
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
|
|
323
|
-
output_format_str(cmd_params_defaults.output_format));
|
|
324
|
-
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
|
|
325
|
-
output_format_str(cmd_params_defaults.output_format_stderr));
|
|
326
|
-
printf(" -v, --verbose verbose output\n");
|
|
327
|
-
printf(" --progress print test progress indicators\n");
|
|
328
|
-
printf("\n");
|
|
329
|
-
printf("test parameters:\n");
|
|
330
|
-
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
|
331
|
-
printf(" -p, --n-prompt <n> (default: %s)\n",
|
|
332
|
-
join(cmd_params_defaults.n_prompt, ",").c_str());
|
|
333
|
-
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
|
334
|
-
printf(" -pg <pp,tg> (default: %s)\n",
|
|
335
|
-
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
|
336
|
-
printf(" -d, --n-depth <n> (default: %s)\n",
|
|
337
|
-
join(cmd_params_defaults.n_depth, ",").c_str());
|
|
338
|
-
printf(" -b, --batch-size <n> (default: %s)\n",
|
|
339
|
-
join(cmd_params_defaults.n_batch, ",").c_str());
|
|
340
|
-
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
|
341
|
-
join(cmd_params_defaults.n_ubatch, ",").c_str());
|
|
342
|
-
printf(" -ctk, --cache-type-k <t> (default: %s)\n",
|
|
343
|
-
join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
|
344
|
-
printf(" -ctv, --cache-type-v <t> (default: %s)\n",
|
|
345
|
-
join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
|
346
|
-
printf(" -dt, --defrag-thold <f> (default: %s)\n",
|
|
347
|
-
join(cmd_params_defaults.defrag_thold, ",").c_str());
|
|
348
|
-
printf(" -t, --threads <n> (default: %s)\n",
|
|
349
|
-
join(cmd_params_defaults.n_threads, ",").c_str());
|
|
350
|
-
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n",
|
|
351
|
-
join(cmd_params_defaults.cpu_mask, ",").c_str());
|
|
352
|
-
printf(" --cpu-strict <0|1> (default: %s)\n",
|
|
353
|
-
join(cmd_params_defaults.cpu_strict, ",").c_str());
|
|
354
|
-
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
|
355
|
-
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n",
|
|
356
|
-
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
|
357
|
-
if (llama_supports_rpc()) {
|
|
358
|
-
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
|
|
359
|
-
join(cmd_params_defaults.rpc_servers, ",").c_str());
|
|
360
|
-
}
|
|
361
|
-
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
|
|
362
|
-
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
|
363
|
-
printf(" -mg, --main-gpu <i> (default: %s)\n",
|
|
364
|
-
join(cmd_params_defaults.main_gpu, ",").c_str());
|
|
365
|
-
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n",
|
|
366
|
-
join(cmd_params_defaults.no_kv_offload, ",").c_str());
|
|
367
|
-
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
|
|
368
|
-
join(cmd_params_defaults.flash_attn, ",").c_str());
|
|
369
|
-
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
|
370
|
-
join(cmd_params_defaults.use_mmap, ",").c_str());
|
|
371
|
-
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
|
372
|
-
join(cmd_params_defaults.embeddings, ",").c_str());
|
|
373
|
-
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
|
374
|
-
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
|
|
375
|
-
printf(" (default: disabled)\n");
|
|
376
|
-
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
|
|
377
|
-
printf("\n");
|
|
378
|
-
printf(
|
|
379
|
-
"Multiple values can be given for each parameter by separating them with ','\n"
|
|
380
|
-
"or by specifying the parameter multiple times. Ranges can be given as\n"
|
|
381
|
-
"'first-last' or 'first-last+step' or 'first-last*mult'.\n");
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
// Translate a cache-type CLI token (e.g. "f16", "q8_0") into the matching
// ggml type. Returns GGML_TYPE_COUNT when the name is not recognized, which
// callers treat as an invalid-parameter error.
static ggml_type ggml_type_from_name(const std::string & s) {
    static const std::map<std::string, ggml_type> type_by_name = {
        { "f16",    GGML_TYPE_F16    },
        { "bf16",   GGML_TYPE_BF16   },
        { "q8_0",   GGML_TYPE_Q8_0   },
        { "q4_0",   GGML_TYPE_Q4_0   },
        { "q4_1",   GGML_TYPE_Q4_1   },
        { "q5_0",   GGML_TYPE_Q5_0   },
        { "q5_1",   GGML_TYPE_Q5_1   },
        { "iq4_nl", GGML_TYPE_IQ4_NL },
    };

    const auto it = type_by_name.find(s);
    return it != type_by_name.end() ? it->second : GGML_TYPE_COUNT;
}
|
|
412
|
-
|
|
413
|
-
// Parse argv into a cmd_params. Vector options accumulate values across
// repeated flags and comma-separated lists; any vector left empty afterwards
// is filled from cmd_params_defaults. Prints usage and exits on bad input.
// NOTE: the -ot handler destructively writes NUL bytes into argv (see the
// comment there), which is acceptable since args are parsed exactly once.
static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
    bool invalid_param = false;
    const std::string arg_prefix = "--";
    const char split_delim = ',';

    // scalar options start from their defaults; vector options stay empty
    // here and get defaulted after the parse loop
    params.verbose = cmd_params_defaults.verbose;
    params.output_format = cmd_params_defaults.output_format;
    params.output_format_stderr = cmd_params_defaults.output_format_stderr;
    params.reps = cmd_params_defaults.reps;
    params.numa = cmd_params_defaults.numa;
    params.prio = cmd_params_defaults.prio;
    params.delay = cmd_params_defaults.delay;
    params.progress = cmd_params_defaults.progress;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        // accept "--foo_bar" as an alias for "--foo-bar"
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        // std::stoi / parse_int_range may throw on malformed numbers;
        // the catch below converts that into an invalid-parameter error
        try {
            if (arg == "-h" || arg == "--help") {
                print_usage(argc, argv);
                exit(0);
            } else if (arg == "-m" || arg == "--model") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);
                params.model.insert(params.model.end(), p.begin(), p.end());
            } else if (arg == "-p" || arg == "--n-prompt") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
            } else if (arg == "-n" || arg == "--n-gen") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
            } else if (arg == "-pg") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                // exactly one "pp,tg" pair per -pg occurrence
                auto p = string_split<std::string>(argv[i], ',');
                if (p.size() != 2) {
                    invalid_param = true;
                    break;
                }
                params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
            } else if (arg == "-d" || arg == "--n-depth") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
            } else if (arg == "-b" || arg == "--batch-size") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
            } else if (arg == "-ub" || arg == "--ubatch-size") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
            } else if (arg == "-ctk" || arg == "--cache-type-k") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                // validate all names before committing any of them
                std::vector<ggml_type> types;
                for (const auto & t : p) {
                    ggml_type gt = ggml_type_from_name(t);
                    if (gt == GGML_TYPE_COUNT) {
                        invalid_param = true;
                        break;
                    }
                    types.push_back(gt);
                }
                if (invalid_param) {
                    break;
                }
                params.type_k.insert(params.type_k.end(), types.begin(), types.end());
            } else if (arg == "-ctv" || arg == "--cache-type-v") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                std::vector<ggml_type> types;
                for (const auto & t : p) {
                    ggml_type gt = ggml_type_from_name(t);
                    if (gt == GGML_TYPE_COUNT) {
                        invalid_param = true;
                        break;
                    }
                    types.push_back(gt);
                }
                if (invalid_param) {
                    break;
                }
                params.type_v.insert(params.type_v.end(), types.begin(), types.end());
            } else if (arg == "-dt" || arg == "--defrag-thold") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<float>(argv[i], split_delim);
                params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
            } else if (arg == "-t" || arg == "--threads") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
            } else if (arg == "-C" || arg == "--cpu-mask") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);
                params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
            } else if (arg == "--cpu-strict") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
            } else if (arg == "--poll") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.poll.insert(params.poll.end(), p.begin(), p.end());
            } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = parse_int_range(argv[i]);
                params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
            } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
                // only recognized when the RPC backend is available
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.rpc_servers.push_back(argv[i]);
            } else if (arg == "-sm" || arg == "--split-mode") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<std::string>(argv[i], split_delim);

                // validate all mode names before committing any of them
                std::vector<llama_split_mode> modes;
                for (const auto & m : p) {
                    llama_split_mode mode;
                    if (m == "none") {
                        mode = LLAMA_SPLIT_MODE_NONE;
                    } else if (m == "layer") {
                        mode = LLAMA_SPLIT_MODE_LAYER;
                    } else if (m == "row") {
                        mode = LLAMA_SPLIT_MODE_ROW;
                    } else {
                        invalid_param = true;
                        break;
                    }
                    modes.push_back(mode);
                }
                if (invalid_param) {
                    break;
                }
                params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
            } else if (arg == "-mg" || arg == "--main-gpu") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.main_gpu = parse_int_range(argv[i]);
            } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
            } else if (arg == "--numa") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                std::string value(argv[i]);
                // empty value defaults to "distribute"
                if (value == "distribute" || value == "") {
                    params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
                } else if (value == "isolate") {
                    params.numa = GGML_NUMA_STRATEGY_ISOLATE;
                } else if (value == "numactl") {
                    params.numa = GGML_NUMA_STRATEGY_NUMACTL;
                } else {
                    invalid_param = true;
                    break;
                }
            } else if (arg == "-fa" || arg == "--flash-attn") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
            } else if (arg == "-mmp" || arg == "--mmap") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
            } else if (arg == "-embd" || arg == "--embeddings") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
            } else if (arg == "-nopo" || arg == "--no-op-offload") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                auto p = string_split<bool>(argv[i], split_delim);
                params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
            } else if (arg == "-ts" || arg == "--tensor-split") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                for (auto ts : string_split<std::string>(argv[i], split_delim)) {
                    // split string by ; and /
                    const std::regex regex{ R"([;/]+)" };
                    std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
                    std::vector<std::string> split_arg{ it, {} };
                    GGML_ASSERT(split_arg.size() <= llama_max_devices());

                    // missing trailing entries default to 0 (unused devices)
                    std::vector<float> tensor_split(llama_max_devices());
                    for (size_t i = 0; i < llama_max_devices(); ++i) {
                        if (i < split_arg.size()) {
                            tensor_split[i] = std::stof(split_arg[i]);
                        } else {
                            tensor_split[i] = 0.0f;
                        }
                    }
                    params.tensor_split.push_back(tensor_split);
                }
            } else if (arg == "-ot" || arg == "--override-tensor") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                // value format: <pattern>=<buft>[;<pattern>=<buft>...][,<group>...]
                auto * value = argv[i];
                /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
                if (buft_list.empty()) {
                    // enumerate all the devices and add their buffer types to the list
                    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
                        auto * dev = ggml_backend_dev_get(i);
                        auto * buft = ggml_backend_dev_buffer_type(dev);
                        if (buft) {
                            buft_list[ggml_backend_buft_name(buft)] = buft;
                        }
                    }
                }
                auto override_group_span_len = std::strcspn(value, ",");
                bool last_group = false;
                do {
                    if (override_group_span_len == 0) {
                        // Adds an empty override-tensors for an empty span
                        params.tensor_buft_overrides.push_back({{}});
                        if (value[override_group_span_len] == '\0') {
                            value = &value[override_group_span_len];
                            last_group = true;
                        } else {
                            value = &value[override_group_span_len + 1];
                            override_group_span_len = std::strcspn(value, ",");
                        }
                        continue;
                    }
                    // Stamps null terminators into the argv
                    // value for this option to avoid the
                    // memory leak present in the implementation
                    // over in arg.cpp. Acceptable because we
                    // only parse these args once in this program.
                    auto * override_group = value;
                    if (value[override_group_span_len] == '\0') {
                        value = &value[override_group_span_len];
                        last_group = true;
                    } else {
                        value[override_group_span_len] = '\0';
                        value = &value[override_group_span_len + 1];
                    }
                    std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
                    auto override_span_len = std::strcspn(override_group, ";");
                    while (override_span_len > 0) {
                        auto * override = override_group;
                        if (override_group[override_span_len] != '\0') {
                            override_group[override_span_len] = '\0';
                            override_group = &override_group[override_span_len + 1];
                        } else {
                            override_group = &override_group[override_span_len];
                        }
                        // each override must contain '=' before the ';' separator
                        auto tensor_name_span_len = std::strcspn(override, "=");
                        if (tensor_name_span_len >= override_span_len) {
                            invalid_param = true;
                            break;
                        }
                        override[tensor_name_span_len] = '\0';
                        auto * tensor_name = override;
                        auto * buffer_type = &override[tensor_name_span_len + 1];
                        if (buft_list.find(buffer_type) == buft_list.end()) {
                            printf("error: unrecognized buffer type '%s'\n", buffer_type);
                            printf("Available buffer types:\n");
                            for (const auto & it : buft_list) {
                                printf(" %s\n", ggml_backend_buft_name(it.second));
                            }
                            invalid_param = true;
                            break;
                        }
                        // NOTE: stores pointers into argv; valid for the
                        // lifetime of the program
                        group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
                        override_span_len = std::strcspn(override_group, ";");
                    }
                    if (invalid_param) {
                        break;
                    }
                    // terminate the group with an empty-pattern sentinel,
                    // as required by llama_model_params
                    group_tensor_buft_overrides.push_back({nullptr,nullptr});
                    params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
                    override_group_span_len = std::strcspn(value, ",");
                } while (!last_group);
            } else if (arg == "-r" || arg == "--repetitions") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.reps = std::stoi(argv[i]);
            } else if (arg == "--prio") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
            } else if (arg == "--delay") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                params.delay = std::stoi(argv[i]);
            } else if (arg == "-o" || arg == "--output") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                invalid_param = !output_format_from_str(argv[i], params.output_format);
            } else if (arg == "-oe" || arg == "--output-err") {
                if (++i >= argc) {
                    invalid_param = true;
                    break;
                }
                invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
            } else if (arg == "-v" || arg == "--verbose") {
                params.verbose = true;
            } else if (arg == "--progress") {
                params.progress = true;
            } else {
                // unknown flag
                invalid_param = true;
                break;
            }
        } catch (const std::exception & e) {
            fprintf(stderr, "error: %s\n", e.what());
            invalid_param = true;
            break;
        }
    }

    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        print_usage(argc, argv);
        exit(1);
    }

    // set defaults
    if (params.model.empty()) {
        params.model = cmd_params_defaults.model;
    }
    if (params.n_prompt.empty()) {
        params.n_prompt = cmd_params_defaults.n_prompt;
    }
    if (params.n_gen.empty()) {
        params.n_gen = cmd_params_defaults.n_gen;
    }
    if (params.n_pg.empty()) {
        params.n_pg = cmd_params_defaults.n_pg;
    }
    if (params.n_depth.empty()) {
        params.n_depth = cmd_params_defaults.n_depth;
    }
    if (params.n_batch.empty()) {
        params.n_batch = cmd_params_defaults.n_batch;
    }
    if (params.n_ubatch.empty()) {
        params.n_ubatch = cmd_params_defaults.n_ubatch;
    }
    if (params.type_k.empty()) {
        params.type_k = cmd_params_defaults.type_k;
    }
    if (params.type_v.empty()) {
        params.type_v = cmd_params_defaults.type_v;
    }
    if (params.defrag_thold.empty()) {
        params.defrag_thold = cmd_params_defaults.defrag_thold;
    }
    if (params.n_gpu_layers.empty()) {
        params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
    }
    if (params.rpc_servers.empty()) {
        params.rpc_servers = cmd_params_defaults.rpc_servers;
    }
    if (params.split_mode.empty()) {
        params.split_mode = cmd_params_defaults.split_mode;
    }
    if (params.main_gpu.empty()) {
        params.main_gpu = cmd_params_defaults.main_gpu;
    }
    if (params.no_kv_offload.empty()) {
        params.no_kv_offload = cmd_params_defaults.no_kv_offload;
    }
    if (params.flash_attn.empty()) {
        params.flash_attn = cmd_params_defaults.flash_attn;
    }
    if (params.tensor_split.empty()) {
        params.tensor_split = cmd_params_defaults.tensor_split;
    }
    if (params.tensor_buft_overrides.empty()) {
        params.tensor_buft_overrides = cmd_params_defaults.tensor_buft_overrides;
    }
    if (params.use_mmap.empty()) {
        params.use_mmap = cmd_params_defaults.use_mmap;
    }
    if (params.embeddings.empty()) {
        params.embeddings = cmd_params_defaults.embeddings;
    }
    if (params.no_op_offload.empty()) {
        params.no_op_offload = cmd_params_defaults.no_op_offload;
    }
    if (params.n_threads.empty()) {
        params.n_threads = cmd_params_defaults.n_threads;
    }
    if (params.cpu_mask.empty()) {
        params.cpu_mask = cmd_params_defaults.cpu_mask;
    }
    if (params.cpu_strict.empty()) {
        params.cpu_strict = cmd_params_defaults.cpu_strict;
    }
    if (params.poll.empty()) {
        params.poll = cmd_params_defaults.poll;
    }

    return params;
}
|
|
897
|
-
|
|
898
|
-
// A single fully-resolved benchmark configuration: one concrete value picked
// from each cmd_params vector (see get_cmd_params_instances).
struct cmd_params_instance {
    std::string model;
    int n_prompt;
    int n_gen;
    int n_depth;
    int n_batch;
    int n_ubatch;
    ggml_type type_k;
    ggml_type type_v;
    float defrag_thold;
    int n_threads;
    std::string cpu_mask;
    bool cpu_strict;
    int poll;
    int n_gpu_layers;
    std::string rpc_servers_str;
    llama_split_mode split_mode;
    int main_gpu;
    bool no_kv_offload;
    bool flash_attn;
    std::vector<float> tensor_split;
    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
    bool use_mmap;
    bool embeddings;
    bool no_op_offload;

    // Build the llama_model_params for this instance. When RPC servers are
    // configured, looks up the RPC backend and registers one device per
    // server endpoint; exits the process on any RPC setup failure.
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
        if (!rpc_servers_str.empty()) {
            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');

            // add RPC devices
            if (!rpc_servers.empty()) {
                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
                if (!rpc_reg) {
                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
                    exit(1);
                }

                // the device-add entry point is resolved dynamically from the
                // RPC backend registry rather than linked directly
                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
                if (!ggml_backend_rpc_add_device_fn) {
                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
                    exit(1);
                }
                // static: mparams.devices keeps a raw pointer into this
                // vector, so it must outlive the returned mparams
                static std::vector<ggml_backend_dev_t> devices;
                devices.clear();
                for (const std::string & server : rpc_servers) {
                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
                    if (dev) {
                        devices.push_back(dev);
                    } else {
                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
                        exit(1);
                    }
                }
                // the device list is NULL-terminated
                devices.push_back(nullptr);
                mparams.devices = devices.data();
            }
        }
        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();
        mparams.use_mmap = use_mmap;

        if (tensor_buft_overrides.empty()) {
            mparams.tensor_buft_overrides = nullptr;
        } else {
            GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
            mparams.tensor_buft_overrides = tensor_buft_overrides.data();
        }

        return mparams;
    }

    // True when `other` would produce identical model params, meaning the
    // already-loaded model can be reused without reloading from disk.
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
               split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
               tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
    }

    // Build the llama_context_params for this instance.
    llama_context_params to_llama_cparams() const {
        llama_context_params cparams = llama_context_default_params();

        // the context must hold the prompt, the generated tokens, and any
        // pre-filled depth
        cparams.n_ctx = n_prompt + n_gen + n_depth;
        cparams.n_batch = n_batch;
        cparams.n_ubatch = n_ubatch;
        cparams.type_k = type_k;
        cparams.type_v = type_v;
        cparams.defrag_thold = defrag_thold;
        cparams.offload_kqv = !no_kv_offload;
        cparams.flash_attn = flash_attn;
        cparams.embeddings = embeddings;
        cparams.op_offload = !no_op_offload;
        cparams.swa_full = false;

        return cparams;
    }
};
|
|
999
|
-
|
|
1000
|
-
static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
|
|
1001
|
-
std::vector<cmd_params_instance> instances;
|
|
1002
|
-
|
|
1003
|
-
// this ordering minimizes the number of times that each model needs to be reloaded
|
|
1004
|
-
// clang-format off
|
|
1005
|
-
for (const auto & m : params.model)
|
|
1006
|
-
for (const auto & nl : params.n_gpu_layers)
|
|
1007
|
-
for (const auto & rpc : params.rpc_servers)
|
|
1008
|
-
for (const auto & sm : params.split_mode)
|
|
1009
|
-
for (const auto & mg : params.main_gpu)
|
|
1010
|
-
for (const auto & ts : params.tensor_split)
|
|
1011
|
-
for (const auto & ot : params.tensor_buft_overrides)
|
|
1012
|
-
for (const auto & mmp : params.use_mmap)
|
|
1013
|
-
for (const auto & embd : params.embeddings)
|
|
1014
|
-
for (const auto & nopo : params.no_op_offload)
|
|
1015
|
-
for (const auto & nb : params.n_batch)
|
|
1016
|
-
for (const auto & nub : params.n_ubatch)
|
|
1017
|
-
for (const auto & tk : params.type_k)
|
|
1018
|
-
for (const auto & tv : params.type_v)
|
|
1019
|
-
for (const auto & defrag_thold : params.defrag_thold)
|
|
1020
|
-
for (const auto & nkvo : params.no_kv_offload)
|
|
1021
|
-
for (const auto & fa : params.flash_attn)
|
|
1022
|
-
for (const auto & nt : params.n_threads)
|
|
1023
|
-
for (const auto & cm : params.cpu_mask)
|
|
1024
|
-
for (const auto & cs : params.cpu_strict)
|
|
1025
|
-
for (const auto & nd : params.n_depth)
|
|
1026
|
-
for (const auto & pl : params.poll) {
|
|
1027
|
-
for (const auto & n_prompt : params.n_prompt) {
|
|
1028
|
-
if (n_prompt == 0) {
|
|
1029
|
-
continue;
|
|
1030
|
-
}
|
|
1031
|
-
cmd_params_instance instance = {
|
|
1032
|
-
/* .model = */ m,
|
|
1033
|
-
/* .n_prompt = */ n_prompt,
|
|
1034
|
-
/* .n_gen = */ 0,
|
|
1035
|
-
/* .n_depth = */ nd,
|
|
1036
|
-
/* .n_batch = */ nb,
|
|
1037
|
-
/* .n_ubatch = */ nub,
|
|
1038
|
-
/* .type_k = */ tk,
|
|
1039
|
-
/* .type_v = */ tv,
|
|
1040
|
-
/* .defrag_thold = */ defrag_thold,
|
|
1041
|
-
/* .n_threads = */ nt,
|
|
1042
|
-
/* .cpu_mask = */ cm,
|
|
1043
|
-
/* .cpu_strict = */ cs,
|
|
1044
|
-
/* .poll = */ pl,
|
|
1045
|
-
/* .n_gpu_layers = */ nl,
|
|
1046
|
-
/* .rpc_servers = */ rpc,
|
|
1047
|
-
/* .split_mode = */ sm,
|
|
1048
|
-
/* .main_gpu = */ mg,
|
|
1049
|
-
/* .no_kv_offload= */ nkvo,
|
|
1050
|
-
/* .flash_attn = */ fa,
|
|
1051
|
-
/* .tensor_split = */ ts,
|
|
1052
|
-
/* .tensor_buft_overrides = */ ot,
|
|
1053
|
-
/* .use_mmap = */ mmp,
|
|
1054
|
-
/* .embeddings = */ embd,
|
|
1055
|
-
/* .no_op_offload= */ nopo,
|
|
1056
|
-
};
|
|
1057
|
-
instances.push_back(instance);
|
|
1058
|
-
}
|
|
1059
|
-
|
|
1060
|
-
for (const auto & n_gen : params.n_gen) {
|
|
1061
|
-
if (n_gen == 0) {
|
|
1062
|
-
continue;
|
|
1063
|
-
}
|
|
1064
|
-
cmd_params_instance instance = {
|
|
1065
|
-
/* .model = */ m,
|
|
1066
|
-
/* .n_prompt = */ 0,
|
|
1067
|
-
/* .n_gen = */ n_gen,
|
|
1068
|
-
/* .n_depth = */ nd,
|
|
1069
|
-
/* .n_batch = */ nb,
|
|
1070
|
-
/* .n_ubatch = */ nub,
|
|
1071
|
-
/* .type_k = */ tk,
|
|
1072
|
-
/* .type_v = */ tv,
|
|
1073
|
-
/* .defrag_thold = */ defrag_thold,
|
|
1074
|
-
/* .n_threads = */ nt,
|
|
1075
|
-
/* .cpu_mask = */ cm,
|
|
1076
|
-
/* .cpu_strict = */ cs,
|
|
1077
|
-
/* .poll = */ pl,
|
|
1078
|
-
/* .n_gpu_layers = */ nl,
|
|
1079
|
-
/* .rpc_servers = */ rpc,
|
|
1080
|
-
/* .split_mode = */ sm,
|
|
1081
|
-
/* .main_gpu = */ mg,
|
|
1082
|
-
/* .no_kv_offload= */ nkvo,
|
|
1083
|
-
/* .flash_attn = */ fa,
|
|
1084
|
-
/* .tensor_split = */ ts,
|
|
1085
|
-
/* .tensor_buft_overrides = */ ot,
|
|
1086
|
-
/* .use_mmap = */ mmp,
|
|
1087
|
-
/* .embeddings = */ embd,
|
|
1088
|
-
/* .no_op_offload= */ nopo,
|
|
1089
|
-
};
|
|
1090
|
-
instances.push_back(instance);
|
|
1091
|
-
}
|
|
1092
|
-
|
|
1093
|
-
for (const auto & n_pg : params.n_pg) {
|
|
1094
|
-
if (n_pg.first == 0 && n_pg.second == 0) {
|
|
1095
|
-
continue;
|
|
1096
|
-
}
|
|
1097
|
-
cmd_params_instance instance = {
|
|
1098
|
-
/* .model = */ m,
|
|
1099
|
-
/* .n_prompt = */ n_pg.first,
|
|
1100
|
-
/* .n_gen = */ n_pg.second,
|
|
1101
|
-
/* .n_depth = */ nd,
|
|
1102
|
-
/* .n_batch = */ nb,
|
|
1103
|
-
/* .n_ubatch = */ nub,
|
|
1104
|
-
/* .type_k = */ tk,
|
|
1105
|
-
/* .type_v = */ tv,
|
|
1106
|
-
/* .defrag_thold = */ defrag_thold,
|
|
1107
|
-
/* .n_threads = */ nt,
|
|
1108
|
-
/* .cpu_mask = */ cm,
|
|
1109
|
-
/* .cpu_strict = */ cs,
|
|
1110
|
-
/* .poll = */ pl,
|
|
1111
|
-
/* .n_gpu_layers = */ nl,
|
|
1112
|
-
/* .rpc_servers = */ rpc,
|
|
1113
|
-
/* .split_mode = */ sm,
|
|
1114
|
-
/* .main_gpu = */ mg,
|
|
1115
|
-
/* .no_kv_offload= */ nkvo,
|
|
1116
|
-
/* .flash_attn = */ fa,
|
|
1117
|
-
/* .tensor_split = */ ts,
|
|
1118
|
-
/* .tensor_buft_overrides = */ ot,
|
|
1119
|
-
/* .use_mmap = */ mmp,
|
|
1120
|
-
/* .embeddings = */ embd,
|
|
1121
|
-
/* .no_op_offload= */ nopo,
|
|
1122
|
-
};
|
|
1123
|
-
instances.push_back(instance);
|
|
1124
|
-
}
|
|
1125
|
-
}
|
|
1126
|
-
// clang-format on
|
|
1127
|
-
|
|
1128
|
-
return instances;
|
|
1129
|
-
}
|
|
1130
|
-
|
|
1131
|
-
struct test {
|
|
1132
|
-
static const std::string build_commit;
|
|
1133
|
-
static const int build_number;
|
|
1134
|
-
const std::string cpu_info;
|
|
1135
|
-
const std::string gpu_info;
|
|
1136
|
-
std::string model_filename;
|
|
1137
|
-
std::string model_type;
|
|
1138
|
-
uint64_t model_size;
|
|
1139
|
-
uint64_t model_n_params;
|
|
1140
|
-
int n_batch;
|
|
1141
|
-
int n_ubatch;
|
|
1142
|
-
int n_threads;
|
|
1143
|
-
std::string cpu_mask;
|
|
1144
|
-
bool cpu_strict;
|
|
1145
|
-
int poll;
|
|
1146
|
-
ggml_type type_k;
|
|
1147
|
-
ggml_type type_v;
|
|
1148
|
-
float defrag_thold;
|
|
1149
|
-
int n_gpu_layers;
|
|
1150
|
-
llama_split_mode split_mode;
|
|
1151
|
-
int main_gpu;
|
|
1152
|
-
bool no_kv_offload;
|
|
1153
|
-
bool flash_attn;
|
|
1154
|
-
std::vector<float> tensor_split;
|
|
1155
|
-
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
|
1156
|
-
bool use_mmap;
|
|
1157
|
-
bool embeddings;
|
|
1158
|
-
bool no_op_offload;
|
|
1159
|
-
int n_prompt;
|
|
1160
|
-
int n_gen;
|
|
1161
|
-
int n_depth;
|
|
1162
|
-
std::string test_time;
|
|
1163
|
-
std::vector<uint64_t> samples_ns;
|
|
1164
|
-
|
|
1165
|
-
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) :
|
|
1166
|
-
cpu_info(get_cpu_info()),
|
|
1167
|
-
gpu_info(get_gpu_info()) {
|
|
1168
|
-
|
|
1169
|
-
model_filename = inst.model;
|
|
1170
|
-
char buf[128];
|
|
1171
|
-
llama_model_desc(lmodel, buf, sizeof(buf));
|
|
1172
|
-
model_type = buf;
|
|
1173
|
-
model_size = llama_model_size(lmodel);
|
|
1174
|
-
model_n_params = llama_model_n_params(lmodel);
|
|
1175
|
-
n_batch = inst.n_batch;
|
|
1176
|
-
n_ubatch = inst.n_ubatch;
|
|
1177
|
-
n_threads = inst.n_threads;
|
|
1178
|
-
cpu_mask = inst.cpu_mask;
|
|
1179
|
-
cpu_strict = inst.cpu_strict;
|
|
1180
|
-
poll = inst.poll;
|
|
1181
|
-
type_k = inst.type_k;
|
|
1182
|
-
type_v = inst.type_v;
|
|
1183
|
-
defrag_thold = inst.defrag_thold;
|
|
1184
|
-
n_gpu_layers = inst.n_gpu_layers;
|
|
1185
|
-
split_mode = inst.split_mode;
|
|
1186
|
-
main_gpu = inst.main_gpu;
|
|
1187
|
-
no_kv_offload = inst.no_kv_offload;
|
|
1188
|
-
flash_attn = inst.flash_attn;
|
|
1189
|
-
tensor_split = inst.tensor_split;
|
|
1190
|
-
tensor_buft_overrides = inst.tensor_buft_overrides;
|
|
1191
|
-
use_mmap = inst.use_mmap;
|
|
1192
|
-
embeddings = inst.embeddings;
|
|
1193
|
-
no_op_offload = inst.no_op_offload;
|
|
1194
|
-
n_prompt = inst.n_prompt;
|
|
1195
|
-
n_gen = inst.n_gen;
|
|
1196
|
-
n_depth = inst.n_depth;
|
|
1197
|
-
// RFC 3339 date-time format
|
|
1198
|
-
time_t t = time(NULL);
|
|
1199
|
-
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
|
1200
|
-
test_time = buf;
|
|
1201
|
-
|
|
1202
|
-
(void) ctx;
|
|
1203
|
-
}
|
|
1204
|
-
|
|
1205
|
-
uint64_t avg_ns() const { return ::avg(samples_ns); }
|
|
1206
|
-
|
|
1207
|
-
uint64_t stdev_ns() const { return ::stdev(samples_ns); }
|
|
1208
|
-
|
|
1209
|
-
std::vector<double> get_ts() const {
|
|
1210
|
-
int n_tokens = n_prompt + n_gen;
|
|
1211
|
-
std::vector<double> ts;
|
|
1212
|
-
std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts),
|
|
1213
|
-
[n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
|
|
1214
|
-
return ts;
|
|
1215
|
-
}
|
|
1216
|
-
|
|
1217
|
-
double avg_ts() const { return ::avg(get_ts()); }
|
|
1218
|
-
|
|
1219
|
-
double stdev_ts() const { return ::stdev(get_ts()); }
|
|
1220
|
-
|
|
1221
|
-
static std::string get_backend() {
|
|
1222
|
-
std::vector<std::string> backends;
|
|
1223
|
-
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
|
1224
|
-
auto * reg = ggml_backend_reg_get(i);
|
|
1225
|
-
std::string name = ggml_backend_reg_name(reg);
|
|
1226
|
-
if (name != "CPU") {
|
|
1227
|
-
backends.push_back(ggml_backend_reg_name(reg));
|
|
1228
|
-
}
|
|
1229
|
-
}
|
|
1230
|
-
return backends.empty() ? "CPU" : join(backends, ",");
|
|
1231
|
-
}
|
|
1232
|
-
|
|
1233
|
-
static const std::vector<std::string> & get_fields() {
|
|
1234
|
-
static const std::vector<std::string> fields = {
|
|
1235
|
-
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
|
1236
|
-
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
|
1237
|
-
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
|
1238
|
-
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
|
1239
|
-
"defrag_thold",
|
|
1240
|
-
"use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
|
|
1241
|
-
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
|
1242
|
-
};
|
|
1243
|
-
return fields;
|
|
1244
|
-
}
|
|
1245
|
-
|
|
1246
|
-
enum field_type { STRING, BOOL, INT, FLOAT };
|
|
1247
|
-
|
|
1248
|
-
static field_type get_field_type(const std::string & field) {
|
|
1249
|
-
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
|
1250
|
-
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
|
1251
|
-
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
|
1252
|
-
field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
|
|
1253
|
-
return INT;
|
|
1254
|
-
}
|
|
1255
|
-
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
|
1256
|
-
field == "use_mmap" || field == "embeddings") {
|
|
1257
|
-
return BOOL;
|
|
1258
|
-
}
|
|
1259
|
-
if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
|
|
1260
|
-
return FLOAT;
|
|
1261
|
-
}
|
|
1262
|
-
return STRING;
|
|
1263
|
-
}
|
|
1264
|
-
|
|
1265
|
-
std::vector<std::string> get_values() const {
|
|
1266
|
-
std::string tensor_split_str;
|
|
1267
|
-
std::string tensor_buft_overrides_str;
|
|
1268
|
-
int max_nonzero = 0;
|
|
1269
|
-
for (size_t i = 0; i < llama_max_devices(); i++) {
|
|
1270
|
-
if (tensor_split[i] > 0) {
|
|
1271
|
-
max_nonzero = i;
|
|
1272
|
-
}
|
|
1273
|
-
}
|
|
1274
|
-
for (int i = 0; i <= max_nonzero; i++) {
|
|
1275
|
-
char buf[32];
|
|
1276
|
-
snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
|
|
1277
|
-
tensor_split_str += buf;
|
|
1278
|
-
if (i < max_nonzero) {
|
|
1279
|
-
tensor_split_str += "/";
|
|
1280
|
-
}
|
|
1281
|
-
}
|
|
1282
|
-
if (tensor_buft_overrides.size() == 1) {
|
|
1283
|
-
// Last element of tensor_buft_overrides is always a null pattern
|
|
1284
|
-
// so if it is only one element long, it must be a null pattern.
|
|
1285
|
-
GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr);
|
|
1286
|
-
tensor_buft_overrides_str += "none";
|
|
1287
|
-
} else {
|
|
1288
|
-
for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) {
|
|
1289
|
-
// Last element of tensor_buft_overrides is always a null pattern
|
|
1290
|
-
if (tensor_buft_overrides[i].pattern == nullptr) {
|
|
1291
|
-
tensor_buft_overrides_str += "none";
|
|
1292
|
-
} else {
|
|
1293
|
-
tensor_buft_overrides_str += tensor_buft_overrides[i].pattern;
|
|
1294
|
-
tensor_buft_overrides_str += "=";
|
|
1295
|
-
tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft);
|
|
1296
|
-
}
|
|
1297
|
-
if (i + 2 < tensor_buft_overrides.size()) {
|
|
1298
|
-
tensor_buft_overrides_str += ";";
|
|
1299
|
-
}
|
|
1300
|
-
}
|
|
1301
|
-
}
|
|
1302
|
-
std::vector<std::string> values = { build_commit,
|
|
1303
|
-
std::to_string(build_number),
|
|
1304
|
-
cpu_info,
|
|
1305
|
-
gpu_info,
|
|
1306
|
-
get_backend(),
|
|
1307
|
-
model_filename,
|
|
1308
|
-
model_type,
|
|
1309
|
-
std::to_string(model_size),
|
|
1310
|
-
std::to_string(model_n_params),
|
|
1311
|
-
std::to_string(n_batch),
|
|
1312
|
-
std::to_string(n_ubatch),
|
|
1313
|
-
std::to_string(n_threads),
|
|
1314
|
-
cpu_mask,
|
|
1315
|
-
std::to_string(cpu_strict),
|
|
1316
|
-
std::to_string(poll),
|
|
1317
|
-
ggml_type_name(type_k),
|
|
1318
|
-
ggml_type_name(type_v),
|
|
1319
|
-
std::to_string(n_gpu_layers),
|
|
1320
|
-
split_mode_str(split_mode),
|
|
1321
|
-
std::to_string(main_gpu),
|
|
1322
|
-
std::to_string(no_kv_offload),
|
|
1323
|
-
std::to_string(flash_attn),
|
|
1324
|
-
tensor_split_str,
|
|
1325
|
-
tensor_buft_overrides_str,
|
|
1326
|
-
std::to_string(defrag_thold),
|
|
1327
|
-
std::to_string(use_mmap),
|
|
1328
|
-
std::to_string(embeddings),
|
|
1329
|
-
std::to_string(no_op_offload),
|
|
1330
|
-
std::to_string(n_prompt),
|
|
1331
|
-
std::to_string(n_gen),
|
|
1332
|
-
std::to_string(n_depth),
|
|
1333
|
-
test_time,
|
|
1334
|
-
std::to_string(avg_ns()),
|
|
1335
|
-
std::to_string(stdev_ns()),
|
|
1336
|
-
std::to_string(avg_ts()),
|
|
1337
|
-
std::to_string(stdev_ts()) };
|
|
1338
|
-
return values;
|
|
1339
|
-
}
|
|
1340
|
-
|
|
1341
|
-
std::map<std::string, std::string> get_map() const {
|
|
1342
|
-
std::map<std::string, std::string> map;
|
|
1343
|
-
auto fields = get_fields();
|
|
1344
|
-
auto values = get_values();
|
|
1345
|
-
std::transform(fields.begin(), fields.end(), values.begin(), std::inserter(map, map.end()),
|
|
1346
|
-
std::make_pair<const std::string &, const std::string &>);
|
|
1347
|
-
return map;
|
|
1348
|
-
}
|
|
1349
|
-
};
|
|
1350
|
-
|
|
1351
|
-
const std::string test::build_commit = LLAMA_COMMIT;
|
|
1352
|
-
const int test::build_number = LLAMA_BUILD_NUMBER;
|
|
1353
|
-
|
|
1354
|
-
struct printer {
|
|
1355
|
-
virtual ~printer() {}
|
|
1356
|
-
|
|
1357
|
-
FILE * fout;
|
|
1358
|
-
|
|
1359
|
-
virtual void print_header(const cmd_params & params) { (void) params; }
|
|
1360
|
-
|
|
1361
|
-
virtual void print_test(const test & t) = 0;
|
|
1362
|
-
|
|
1363
|
-
virtual void print_footer() {}
|
|
1364
|
-
};
|
|
1365
|
-
|
|
1366
|
-
struct csv_printer : public printer {
|
|
1367
|
-
static std::string escape_csv(const std::string & field) {
|
|
1368
|
-
std::string escaped = "\"";
|
|
1369
|
-
for (auto c : field) {
|
|
1370
|
-
if (c == '"') {
|
|
1371
|
-
escaped += "\"";
|
|
1372
|
-
}
|
|
1373
|
-
escaped += c;
|
|
1374
|
-
}
|
|
1375
|
-
escaped += "\"";
|
|
1376
|
-
return escaped;
|
|
1377
|
-
}
|
|
1378
|
-
|
|
1379
|
-
void print_header(const cmd_params & params) override {
|
|
1380
|
-
std::vector<std::string> fields = test::get_fields();
|
|
1381
|
-
fprintf(fout, "%s\n", join(fields, ",").c_str());
|
|
1382
|
-
(void) params;
|
|
1383
|
-
}
|
|
1384
|
-
|
|
1385
|
-
void print_test(const test & t) override {
|
|
1386
|
-
std::vector<std::string> values = t.get_values();
|
|
1387
|
-
std::transform(values.begin(), values.end(), values.begin(), escape_csv);
|
|
1388
|
-
fprintf(fout, "%s\n", join(values, ",").c_str());
|
|
1389
|
-
}
|
|
1390
|
-
};
|
|
1391
|
-
|
|
1392
|
-
static std::string escape_json(const std::string & value) {
|
|
1393
|
-
std::string escaped;
|
|
1394
|
-
for (auto c : value) {
|
|
1395
|
-
if (c == '"') {
|
|
1396
|
-
escaped += "\\\"";
|
|
1397
|
-
} else if (c == '\\') {
|
|
1398
|
-
escaped += "\\\\";
|
|
1399
|
-
} else if (c <= 0x1f) {
|
|
1400
|
-
char buf[8];
|
|
1401
|
-
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
|
1402
|
-
escaped += buf;
|
|
1403
|
-
} else {
|
|
1404
|
-
escaped += c;
|
|
1405
|
-
}
|
|
1406
|
-
}
|
|
1407
|
-
return escaped;
|
|
1408
|
-
}
|
|
1409
|
-
|
|
1410
|
-
static std::string format_json_value(const std::string & field, const std::string & value) {
|
|
1411
|
-
switch (test::get_field_type(field)) {
|
|
1412
|
-
case test::STRING:
|
|
1413
|
-
return "\"" + escape_json(value) + "\"";
|
|
1414
|
-
case test::BOOL:
|
|
1415
|
-
return value == "0" ? "false" : "true";
|
|
1416
|
-
default:
|
|
1417
|
-
return value;
|
|
1418
|
-
}
|
|
1419
|
-
}
|
|
1420
|
-
|
|
1421
|
-
struct json_printer : public printer {
|
|
1422
|
-
bool first = true;
|
|
1423
|
-
|
|
1424
|
-
void print_header(const cmd_params & params) override {
|
|
1425
|
-
fprintf(fout, "[\n");
|
|
1426
|
-
(void) params;
|
|
1427
|
-
}
|
|
1428
|
-
|
|
1429
|
-
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1430
|
-
assert(fields.size() == values.size());
|
|
1431
|
-
for (size_t i = 0; i < fields.size(); i++) {
|
|
1432
|
-
fprintf(fout, " \"%s\": %s,\n", fields.at(i).c_str(),
|
|
1433
|
-
format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1434
|
-
}
|
|
1435
|
-
}
|
|
1436
|
-
|
|
1437
|
-
void print_test(const test & t) override {
|
|
1438
|
-
if (first) {
|
|
1439
|
-
first = false;
|
|
1440
|
-
} else {
|
|
1441
|
-
fprintf(fout, ",\n");
|
|
1442
|
-
}
|
|
1443
|
-
fprintf(fout, " {\n");
|
|
1444
|
-
print_fields(test::get_fields(), t.get_values());
|
|
1445
|
-
fprintf(fout, " \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
|
|
1446
|
-
fprintf(fout, " \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
|
|
1447
|
-
fprintf(fout, " }");
|
|
1448
|
-
fflush(fout);
|
|
1449
|
-
}
|
|
1450
|
-
|
|
1451
|
-
void print_footer() override { fprintf(fout, "\n]\n"); }
|
|
1452
|
-
};
|
|
1453
|
-
|
|
1454
|
-
struct jsonl_printer : public printer {
|
|
1455
|
-
void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
|
|
1456
|
-
assert(fields.size() == values.size());
|
|
1457
|
-
for (size_t i = 0; i < fields.size(); i++) {
|
|
1458
|
-
fprintf(fout, "\"%s\": %s, ", fields.at(i).c_str(), format_json_value(fields.at(i), values.at(i)).c_str());
|
|
1459
|
-
}
|
|
1460
|
-
}
|
|
1461
|
-
|
|
1462
|
-
void print_test(const test & t) override {
|
|
1463
|
-
fprintf(fout, "{");
|
|
1464
|
-
print_fields(test::get_fields(), t.get_values());
|
|
1465
|
-
fprintf(fout, "\"samples_ns\": [ %s ],", join(t.samples_ns, ", ").c_str());
|
|
1466
|
-
fprintf(fout, "\"samples_ts\": [ %s ]", join(t.get_ts(), ", ").c_str());
|
|
1467
|
-
fprintf(fout, "}\n");
|
|
1468
|
-
fflush(fout);
|
|
1469
|
-
}
|
|
1470
|
-
};
|
|
1471
|
-
|
|
1472
|
-
struct markdown_printer : public printer {
|
|
1473
|
-
std::vector<std::string> fields;
|
|
1474
|
-
|
|
1475
|
-
static int get_field_width(const std::string & field) {
|
|
1476
|
-
if (field == "model") {
|
|
1477
|
-
return -30;
|
|
1478
|
-
}
|
|
1479
|
-
if (field == "t/s") {
|
|
1480
|
-
return 20;
|
|
1481
|
-
}
|
|
1482
|
-
if (field == "size" || field == "params") {
|
|
1483
|
-
return 10;
|
|
1484
|
-
}
|
|
1485
|
-
if (field == "n_gpu_layers") {
|
|
1486
|
-
return 3;
|
|
1487
|
-
}
|
|
1488
|
-
if (field == "n_threads") {
|
|
1489
|
-
return 7;
|
|
1490
|
-
}
|
|
1491
|
-
if (field == "n_batch") {
|
|
1492
|
-
return 7;
|
|
1493
|
-
}
|
|
1494
|
-
if (field == "n_ubatch") {
|
|
1495
|
-
return 8;
|
|
1496
|
-
}
|
|
1497
|
-
if (field == "type_k" || field == "type_v") {
|
|
1498
|
-
return 6;
|
|
1499
|
-
}
|
|
1500
|
-
if (field == "split_mode") {
|
|
1501
|
-
return 5;
|
|
1502
|
-
}
|
|
1503
|
-
if (field == "flash_attn") {
|
|
1504
|
-
return 2;
|
|
1505
|
-
}
|
|
1506
|
-
if (field == "use_mmap") {
|
|
1507
|
-
return 4;
|
|
1508
|
-
}
|
|
1509
|
-
if (field == "test") {
|
|
1510
|
-
return 15;
|
|
1511
|
-
}
|
|
1512
|
-
if (field == "no_op_offload") {
|
|
1513
|
-
return 4;
|
|
1514
|
-
}
|
|
1515
|
-
|
|
1516
|
-
int width = std::max((int) field.length(), 10);
|
|
1517
|
-
|
|
1518
|
-
if (test::get_field_type(field) == test::STRING) {
|
|
1519
|
-
return -width;
|
|
1520
|
-
}
|
|
1521
|
-
return width;
|
|
1522
|
-
}
|
|
1523
|
-
|
|
1524
|
-
static std::string get_field_display_name(const std::string & field) {
|
|
1525
|
-
if (field == "n_gpu_layers") {
|
|
1526
|
-
return "ngl";
|
|
1527
|
-
}
|
|
1528
|
-
if (field == "split_mode") {
|
|
1529
|
-
return "sm";
|
|
1530
|
-
}
|
|
1531
|
-
if (field == "n_threads") {
|
|
1532
|
-
return "threads";
|
|
1533
|
-
}
|
|
1534
|
-
if (field == "no_kv_offload") {
|
|
1535
|
-
return "nkvo";
|
|
1536
|
-
}
|
|
1537
|
-
if (field == "flash_attn") {
|
|
1538
|
-
return "fa";
|
|
1539
|
-
}
|
|
1540
|
-
if (field == "use_mmap") {
|
|
1541
|
-
return "mmap";
|
|
1542
|
-
}
|
|
1543
|
-
if (field == "embeddings") {
|
|
1544
|
-
return "embd";
|
|
1545
|
-
}
|
|
1546
|
-
if (field == "no_op_offload") {
|
|
1547
|
-
return "nopo";
|
|
1548
|
-
}
|
|
1549
|
-
if (field == "tensor_split") {
|
|
1550
|
-
return "ts";
|
|
1551
|
-
}
|
|
1552
|
-
if (field == "tensor_buft_overrides") {
|
|
1553
|
-
return "ot";
|
|
1554
|
-
}
|
|
1555
|
-
return field;
|
|
1556
|
-
}
|
|
1557
|
-
|
|
1558
|
-
void print_header(const cmd_params & params) override {
|
|
1559
|
-
// select fields to print
|
|
1560
|
-
fields.emplace_back("model");
|
|
1561
|
-
fields.emplace_back("size");
|
|
1562
|
-
fields.emplace_back("params");
|
|
1563
|
-
fields.emplace_back("backend");
|
|
1564
|
-
bool is_cpu_backend = test::get_backend().find("CPU") != std::string::npos ||
|
|
1565
|
-
test::get_backend().find("BLAS") != std::string::npos;
|
|
1566
|
-
if (!is_cpu_backend) {
|
|
1567
|
-
fields.emplace_back("n_gpu_layers");
|
|
1568
|
-
}
|
|
1569
|
-
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
|
1570
|
-
fields.emplace_back("n_threads");
|
|
1571
|
-
}
|
|
1572
|
-
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
|
1573
|
-
fields.emplace_back("cpu_mask");
|
|
1574
|
-
}
|
|
1575
|
-
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
|
1576
|
-
fields.emplace_back("cpu_strict");
|
|
1577
|
-
}
|
|
1578
|
-
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
|
1579
|
-
fields.emplace_back("poll");
|
|
1580
|
-
}
|
|
1581
|
-
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
|
1582
|
-
fields.emplace_back("n_batch");
|
|
1583
|
-
}
|
|
1584
|
-
if (params.n_ubatch.size() > 1 || params.n_ubatch != cmd_params_defaults.n_ubatch) {
|
|
1585
|
-
fields.emplace_back("n_ubatch");
|
|
1586
|
-
}
|
|
1587
|
-
if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
|
|
1588
|
-
fields.emplace_back("type_k");
|
|
1589
|
-
}
|
|
1590
|
-
if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
|
|
1591
|
-
fields.emplace_back("type_v");
|
|
1592
|
-
}
|
|
1593
|
-
if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
|
|
1594
|
-
fields.emplace_back("defrag_thold");
|
|
1595
|
-
}
|
|
1596
|
-
if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
|
|
1597
|
-
fields.emplace_back("main_gpu");
|
|
1598
|
-
}
|
|
1599
|
-
if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
|
|
1600
|
-
fields.emplace_back("split_mode");
|
|
1601
|
-
}
|
|
1602
|
-
if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
|
|
1603
|
-
fields.emplace_back("no_kv_offload");
|
|
1604
|
-
}
|
|
1605
|
-
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
|
|
1606
|
-
fields.emplace_back("flash_attn");
|
|
1607
|
-
}
|
|
1608
|
-
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
|
|
1609
|
-
fields.emplace_back("tensor_split");
|
|
1610
|
-
}
|
|
1611
|
-
if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) {
|
|
1612
|
-
fields.emplace_back("tensor_buft_overrides");
|
|
1613
|
-
}
|
|
1614
|
-
if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) {
|
|
1615
|
-
fields.emplace_back("use_mmap");
|
|
1616
|
-
}
|
|
1617
|
-
if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
|
|
1618
|
-
fields.emplace_back("embeddings");
|
|
1619
|
-
}
|
|
1620
|
-
if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
|
|
1621
|
-
fields.emplace_back("no_op_offload");
|
|
1622
|
-
}
|
|
1623
|
-
fields.emplace_back("test");
|
|
1624
|
-
fields.emplace_back("t/s");
|
|
1625
|
-
|
|
1626
|
-
fprintf(fout, "|");
|
|
1627
|
-
for (const auto & field : fields) {
|
|
1628
|
-
fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
|
|
1629
|
-
}
|
|
1630
|
-
fprintf(fout, "\n");
|
|
1631
|
-
fprintf(fout, "|");
|
|
1632
|
-
for (const auto & field : fields) {
|
|
1633
|
-
int width = get_field_width(field);
|
|
1634
|
-
fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
|
|
1635
|
-
}
|
|
1636
|
-
fprintf(fout, "\n");
|
|
1637
|
-
}
|
|
1638
|
-
|
|
1639
|
-
void print_test(const test & t) override {
|
|
1640
|
-
std::map<std::string, std::string> vmap = t.get_map();
|
|
1641
|
-
|
|
1642
|
-
fprintf(fout, "|");
|
|
1643
|
-
for (const auto & field : fields) {
|
|
1644
|
-
std::string value;
|
|
1645
|
-
char buf[128];
|
|
1646
|
-
if (field == "model") {
|
|
1647
|
-
value = t.model_type;
|
|
1648
|
-
} else if (field == "size") {
|
|
1649
|
-
if (t.model_size < 1024 * 1024 * 1024) {
|
|
1650
|
-
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
|
|
1651
|
-
} else {
|
|
1652
|
-
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
|
|
1653
|
-
}
|
|
1654
|
-
value = buf;
|
|
1655
|
-
} else if (field == "params") {
|
|
1656
|
-
if (t.model_n_params < 1000 * 1000 * 1000) {
|
|
1657
|
-
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
|
|
1658
|
-
} else {
|
|
1659
|
-
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
|
|
1660
|
-
}
|
|
1661
|
-
value = buf;
|
|
1662
|
-
} else if (field == "backend") {
|
|
1663
|
-
value = test::get_backend();
|
|
1664
|
-
} else if (field == "test") {
|
|
1665
|
-
if (t.n_prompt > 0 && t.n_gen == 0) {
|
|
1666
|
-
snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
|
|
1667
|
-
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
|
1668
|
-
snprintf(buf, sizeof(buf), "tg%d", t.n_gen);
|
|
1669
|
-
} else {
|
|
1670
|
-
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
|
|
1671
|
-
}
|
|
1672
|
-
if (t.n_depth > 0) {
|
|
1673
|
-
int len = strlen(buf);
|
|
1674
|
-
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
|
|
1675
|
-
}
|
|
1676
|
-
value = buf;
|
|
1677
|
-
} else if (field == "t/s") {
|
|
1678
|
-
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
|
1679
|
-
value = buf;
|
|
1680
|
-
} else if (vmap.find(field) != vmap.end()) {
|
|
1681
|
-
value = vmap.at(field);
|
|
1682
|
-
} else {
|
|
1683
|
-
assert(false);
|
|
1684
|
-
exit(1);
|
|
1685
|
-
}
|
|
1686
|
-
|
|
1687
|
-
int width = get_field_width(field);
|
|
1688
|
-
if (field == "t/s") {
|
|
1689
|
-
// HACK: the utf-8 character is 2 bytes
|
|
1690
|
-
width += 1;
|
|
1691
|
-
}
|
|
1692
|
-
fprintf(fout, " %*s |", width, value.c_str());
|
|
1693
|
-
}
|
|
1694
|
-
fprintf(fout, "\n");
|
|
1695
|
-
}
|
|
1696
|
-
|
|
1697
|
-
void print_footer() override {
|
|
1698
|
-
fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
|
|
1699
|
-
}
|
|
1700
|
-
};
|
|
1701
|
-
|
|
1702
|
-
struct sql_printer : public printer {
|
|
1703
|
-
static std::string get_sql_field_type(const std::string & field) {
|
|
1704
|
-
switch (test::get_field_type(field)) {
|
|
1705
|
-
case test::STRING:
|
|
1706
|
-
return "TEXT";
|
|
1707
|
-
case test::BOOL:
|
|
1708
|
-
case test::INT:
|
|
1709
|
-
return "INTEGER";
|
|
1710
|
-
case test::FLOAT:
|
|
1711
|
-
return "REAL";
|
|
1712
|
-
default:
|
|
1713
|
-
assert(false);
|
|
1714
|
-
exit(1);
|
|
1715
|
-
}
|
|
1716
|
-
}
|
|
1717
|
-
|
|
1718
|
-
void print_header(const cmd_params & params) override {
|
|
1719
|
-
std::vector<std::string> fields = test::get_fields();
|
|
1720
|
-
fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
|
|
1721
|
-
for (size_t i = 0; i < fields.size(); i++) {
|
|
1722
|
-
fprintf(fout, " %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),
|
|
1723
|
-
i < fields.size() - 1 ? "," : "");
|
|
1724
|
-
}
|
|
1725
|
-
fprintf(fout, ");\n");
|
|
1726
|
-
fprintf(fout, "\n");
|
|
1727
|
-
(void) params;
|
|
1728
|
-
}
|
|
1729
|
-
|
|
1730
|
-
void print_test(const test & t) override {
|
|
1731
|
-
fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
|
|
1732
|
-
fprintf(fout, "VALUES (");
|
|
1733
|
-
std::vector<std::string> values = t.get_values();
|
|
1734
|
-
for (size_t i = 0; i < values.size(); i++) {
|
|
1735
|
-
fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
|
|
1736
|
-
}
|
|
1737
|
-
fprintf(fout, ");\n");
|
|
1738
|
-
}
|
|
1739
|
-
};
|
|
1740
|
-
|
|
1741
|
-
static bool test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
|
1742
|
-
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1743
|
-
|
|
1744
|
-
const llama_model * model = llama_get_model(ctx);
|
|
1745
|
-
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
1746
|
-
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
|
|
1747
|
-
|
|
1748
|
-
std::vector<llama_token> tokens(n_batch);
|
|
1749
|
-
|
|
1750
|
-
int n_processed = 0;
|
|
1751
|
-
|
|
1752
|
-
while (n_processed < n_prompt) {
|
|
1753
|
-
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
|
1754
|
-
tokens[0] = n_processed == 0 && llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
|
|
1755
|
-
for (int i = 1; i < n_tokens; i++) {
|
|
1756
|
-
tokens[i] = std::rand() % n_vocab;
|
|
1757
|
-
}
|
|
1758
|
-
int res = llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
|
1759
|
-
if (res != 0) {
|
|
1760
|
-
fprintf(stderr, "%s: failed to decode prompt batch, res = %d\n", __func__, res);
|
|
1761
|
-
return false;
|
|
1762
|
-
}
|
|
1763
|
-
n_processed += n_tokens;
|
|
1764
|
-
}
|
|
1765
|
-
|
|
1766
|
-
llama_synchronize(ctx);
|
|
1767
|
-
return true;
|
|
1768
|
-
}
|
|
1769
|
-
|
|
1770
|
-
static bool test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
|
1771
|
-
llama_set_n_threads(ctx, n_threads, n_threads);
|
|
1772
|
-
|
|
1773
|
-
const llama_model * model = llama_get_model(ctx);
|
|
1774
|
-
const llama_vocab * vocab = llama_model_get_vocab(model);
|
|
1775
|
-
const int32_t n_vocab = llama_vocab_n_tokens(vocab);
|
|
1776
|
-
|
|
1777
|
-
llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
|
|
1778
|
-
|
|
1779
|
-
for (int i = 0; i < n_gen; i++) {
|
|
1780
|
-
int res = llama_decode(ctx, llama_batch_get_one(&token, 1));
|
|
1781
|
-
if (res != 0) {
|
|
1782
|
-
fprintf(stderr, "%s: failed to decode generation batch, res = %d\n", __func__, res);
|
|
1783
|
-
return false;
|
|
1784
|
-
}
|
|
1785
|
-
llama_synchronize(ctx);
|
|
1786
|
-
token = std::rand() % n_vocab;
|
|
1787
|
-
}
|
|
1788
|
-
return true;
|
|
1789
|
-
}
|
|
1790
|
-
|
|
1791
|
-
// No-op log sink: installed via llama_log_set() to silence llama.cpp output
// when the benchmark is not run in verbose mode.
static void llama_null_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    // deliberately discard everything; the casts suppress unused-parameter warnings
    (void) user_data;
    (void) text;
    (void) level;
}
|
|
1796
|
-
|
|
1797
|
-
// Instantiate the result printer matching the requested output format.
// NONE yields an empty pointer (no output); an out-of-range enum value aborts.
static std::unique_ptr<printer> create_printer(output_formats format) {
    printer * impl = nullptr;
    switch (format) {
        case NONE:
            return nullptr;
        case CSV:
            impl = new csv_printer();
            break;
        case JSON:
            impl = new json_printer();
            break;
        case JSONL:
            impl = new jsonl_printer();
            break;
        case MARKDOWN:
            impl = new markdown_printer();
            break;
        case SQL:
            impl = new sql_printer();
            break;
    }
    if (impl == nullptr) {
        // unreachable for valid enum values; mirrors the original hard abort
        GGML_ABORT("fatal error");
    }
    return std::unique_ptr<printer>(impl);
}
|
|
1814
|
-
|
|
1815
|
-
// Entry point for llama-bench: parses the benchmark matrix from argv, then for
// each parameter instance loads (or reuses) the model, creates a context, runs
// warmup + timed prompt/generation repetitions, and reports results through the
// configured printer(s).
int main(int argc, char ** argv) {
    // try to set locale for unicode characters in markdown
    setlocale(LC_CTYPE, ".UTF-8");

#if !defined(NDEBUG)
    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif

#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
    fprintf(stderr, "warning: debug build, performance may be affected\n");
#endif

#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
#endif

    // initialize backends
    ggml_backend_load_all();

    cmd_params params = parse_cmd_params(argc, argv);

    // the threadpool entry points live in the CPU backend; resolve them
    // dynamically so this binary works with backends loaded at runtime
    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!cpu_dev) {
        fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__);
        return 1;
    }
    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
    auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new");
    auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free");

    // initialize llama.cpp
    if (!params.verbose) {
        llama_log_set(llama_null_log_callback, NULL);
    }
    llama_backend_init();
    llama_numa_init(params.numa);

    set_process_priority(params.prio);

    // initialize printer
    std::unique_ptr<printer> p = create_printer(params.output_format);
    std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);

    if (p) {
        p->fout = stdout;
        p->print_header(params);
    }

    if (p_err) {
        p_err->fout = stderr;
        p_err->print_header(params);
    }

    // expand the parameter matrix into one instance per benchmark run
    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);

    llama_model * lmodel = nullptr;
    const cmd_params_instance * prev_inst = nullptr;

    int params_idx = 0;
    auto params_count = params_instances.size();
    for (const auto & inst : params_instances) {
        params_idx++;
        if (params.progress) {
            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
        }
        // keep the same model between tests when possible
        if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
            if (lmodel) {
                llama_model_free(lmodel);
            }

            lmodel = llama_model_load_from_file(inst.model.c_str(), inst.to_llama_mparams());
            if (lmodel == NULL) {
                fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
                return 1;
            }
            prev_inst = &inst;
        }

        // a fresh context is created for every instance, even when the model is reused
        llama_context * ctx = llama_init_from_model(lmodel, inst.to_llama_cparams());
        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
            llama_model_free(lmodel);
            return 1;
        }

        test t(inst, lmodel, ctx);

        // start each instance from an empty KV cache
        llama_kv_self_clear(ctx);

        // cool off before the test
        if (params.delay) {
            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
        }

        // configure a dedicated threadpool (cpu mask / strictness / polling / priority)
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
            fprintf(stderr, "%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
            exit(1);
        }
        tpp.strict_cpu = t.cpu_strict;
        tpp.poll = t.poll;
        tpp.prio = params.prio;

        struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
        if (!threadpool) {
            fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
            exit(1);
        }

        llama_attach_threadpool(ctx, threadpool, NULL);

        // warmup run
        if (t.n_prompt > 0) {
            if (params.progress) {
                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
            }
            //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
            bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
            if (!res) {
                fprintf(stderr, "%s: error: failed to run prompt warmup\n", __func__);
                exit(1);
            }
        }
        if (t.n_gen > 0) {
            if (params.progress) {
                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
            }
            bool res = test_gen(ctx, 1, t.n_threads);
            if (!res) {
                fprintf(stderr, "%s: error: failed to run gen warmup\n", __func__);
                exit(1);
            }
        }

        // timed repetitions
        for (int i = 0; i < params.reps; i++) {
            llama_kv_self_clear(ctx);

            // optional untimed "depth" pass: pre-fill the KV cache before measuring
            if (t.n_depth > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run depth\n", __func__);
                    exit(1);
                }
            }

            // timing starts after the depth pass, so it covers prompt + gen only
            uint64_t t_start = get_time_ns();

            if (t.n_prompt > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run prompt\n", __func__);
                    exit(1);
                }
            }
            if (t.n_gen > 0) {
                if (params.progress) {
                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                            i + 1, params.reps);
                }
                bool res = test_gen(ctx, t.n_gen, t.n_threads);
                if (!res) {
                    fprintf(stderr, "%s: error: failed to run gen\n", __func__);
                    exit(1);
                }
            }

            uint64_t t_ns = get_time_ns() - t_start;
            t.samples_ns.push_back(t_ns);
        }

        if (p) {
            p->print_test(t);
            fflush(p->fout);
        }

        if (p_err) {
            p_err->print_test(t);
            fflush(p_err->fout);
        }

        llama_perf_context_print(ctx);

        // context is freed before the threadpool it is attached to
        llama_free(ctx);

        ggml_threadpool_free_fn(threadpool);
    }

    llama_model_free(lmodel);

    if (p) {
        p->print_footer();
    }

    if (p_err) {
        p_err->print_footer();
    }

    llama_backend_free();

    return 0;
}
|