@fugood/llama.node 0.3.2 → 0.3.4
This diff reflects the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/CMakeLists.txt +7 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +18 -1
- package/package.json +1 -1
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +17 -7
- package/src/EmbeddingWorker.h +2 -1
- package/src/LlamaCompletionWorker.cpp +8 -8
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +89 -27
- package/src/LlamaContext.h +2 -0
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +4 -4
- package/src/llama.cpp/.github/workflows/build.yml +240 -168
- package/src/llama.cpp/.github/workflows/docker.yml +8 -8
- package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
- package/src/llama.cpp/.github/workflows/server.yml +21 -14
- package/src/llama.cpp/CMakeLists.txt +14 -6
- package/src/llama.cpp/Sources/llama/llama.h +4 -0
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/cmake/common.cmake +33 -0
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
- package/src/llama.cpp/common/CMakeLists.txt +6 -4
- package/src/llama.cpp/common/arg.cpp +986 -770
- package/src/llama.cpp/common/arg.h +22 -22
- package/src/llama.cpp/common/common.cpp +212 -351
- package/src/llama.cpp/common/common.h +204 -117
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +50 -50
- package/src/llama.cpp/common/log.h +18 -18
- package/src/llama.cpp/common/ngram-cache.cpp +36 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +163 -121
- package/src/llama.cpp/common/sampling.h +41 -20
- package/src/llama.cpp/common/speculative.cpp +274 -0
- package/src/llama.cpp/common/speculative.h +28 -0
- package/src/llama.cpp/docs/build.md +134 -161
- package/src/llama.cpp/examples/CMakeLists.txt +33 -14
- package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched/batched.cpp +19 -18
- package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
- package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
- package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
- package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
- package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
- package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/infill/infill.cpp +41 -87
- package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
- package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
- package/src/llama.cpp/examples/llava/clip.cpp +263 -66
- package/src/llama.cpp/examples/llava/clip.h +8 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
- package/src/llama.cpp/examples/llava/llava.cpp +83 -22
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
- package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
- package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/main/main.cpp +73 -114
- package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
- package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
- package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/run/run.cpp +911 -0
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
- package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
- package/src/llama.cpp/examples/server/server.cpp +2073 -1339
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
- package/src/llama.cpp/examples/server/utils.hpp +354 -277
- package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/simple/simple.cpp +130 -94
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
- package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
- package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/tts/tts.cpp +932 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
- package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
- package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +159 -417
- package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
- package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
- package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
- package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
- package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
- package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
- package/src/llama.cpp/include/llama-cpp.h +25 -0
- package/src/llama.cpp/include/llama.h +93 -52
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
- package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/src/CMakeLists.txt +4 -8
- package/src/llama.cpp/src/llama-grammar.cpp +15 -15
- package/src/llama.cpp/src/llama-grammar.h +2 -5
- package/src/llama.cpp/src/llama-sampling.cpp +779 -194
- package/src/llama.cpp/src/llama-sampling.h +21 -2
- package/src/llama.cpp/src/llama-vocab.cpp +55 -10
- package/src/llama.cpp/src/llama-vocab.h +35 -11
- package/src/llama.cpp/src/llama.cpp +4317 -2979
- package/src/llama.cpp/src/unicode-data.cpp +2 -2
- package/src/llama.cpp/src/unicode.cpp +62 -51
- package/src/llama.cpp/src/unicode.h +9 -10
- package/src/llama.cpp/tests/CMakeLists.txt +48 -38
- package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
- package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
- package/src/llama.cpp/tests/test-barrier.cpp +1 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
- package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
- package/src/llama.cpp/tests/test-log.cpp +2 -2
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +62 -20
- package/src/llama.cpp/tests/test-sampling.cpp +163 -138
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
- package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
- package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
- package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
- package/src/llama.cpp/common/train.cpp +0 -1515
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
- package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/perplexity/perplexity.cpp

@@ -34,55 +34,6 @@ struct results_log_softmax {
     float prob;
 };

-static void write_logfile(
-    const llama_context * ctx, const gpt_params & params, const llama_model * model,
-    const struct results_perplexity & results
-) {
-    if (params.logdir.empty()) {
-        return;
-    }
-
-    if (params.hellaswag) {
-        LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
-        return;
-    }
-
-    const std::string timestamp = string_get_sortable_timestamp();
-
-    const bool success = fs_create_directory_with_parents(params.logdir);
-    if (!success) {
-        LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
-            __func__, params.logdir.c_str());
-        return;
-    }
-
-    const std::string logfile_path = params.logdir + timestamp + ".yml";
-    FILE * logfile = fopen(logfile_path.c_str(), "w");
-
-    if (logfile == NULL) {
-        LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
-        return;
-    }
-
-    fprintf(logfile, "binary: main\n");
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-    yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
-
-    fprintf(logfile, "\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "# Perplexity Results #\n");
-    fprintf(logfile, "######################\n");
-    fprintf(logfile, "\n");
-
-    yaml_dump_vector_float(logfile, "logits", results.logits);
-    fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
-    yaml_dump_vector_float(logfile, "probs", results.probs);
-
-    llama_perf_dump_yaml(logfile, ctx);
-    fclose(logfile);
-}
-
 static std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];

@@ -169,7 +120,7 @@ static void process_logits(
                 break;
             }
             lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
+            const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
             const double v = -results.log_softmax;
             local_nll += v;
             local_nll2 += v*v;

@@ -203,7 +154,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
                 break;
             }
             lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
+            const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
             local_nll += v;
             local_nll2 += v*v;
         }

@@ -281,7 +232,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
     kld.sum_kld += sum;
     kld.sum_kld2 += sum*sum;
     ++kld.count;
-    if (imax == imax_base)
+    if (imax == imax_base) {
+        ++kld.n_same_top;
+    }

     const float p_base = expf(-nll_base);
     const float p = expf(-nll);

@@ -323,7 +276,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
                 break;
             }
             lock.unlock();
-            std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
+            std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
             kld_values[i] = (float)v.first;
             p_diff_values[i] = v.second;
         }

@@ -337,7 +290,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
     }
 }

-static results_perplexity perplexity_v2(llama_context * ctx, const
+static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
     // Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`

@@ -348,7 +301,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

     LOG_INF("%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens =
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

     const int n_ctx = llama_n_ctx(ctx);

@@ -383,9 +336,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;

     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;

@@ -405,14 +359,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
         // clear the KV cache
         llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size = std::min(end - batch_start, n_batch);

+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
             //LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            if (llama_decode(ctx, batch)) {
                 //LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return {tokens, -1, logit_history, prob_history};
             }

@@ -424,14 +385,16 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }

-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);

             if (j == 0) {
                 tokens[batch_start] = token_org;
             }
         }

+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();

         if (i == 0) {

@@ -447,11 +410,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &

         //LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
         for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
-
             // Calculate probability of next token, given the previous ones.
             const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
+                logits.begin() + size_t(j + 0) * n_vocab,
+                logits.begin() + size_t(j + 1) * n_vocab);

             const float prob = softmax(tok_logits)[tokens[start + j + 1]];
             logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];

@@ -472,7 +434,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     return {tokens, std::exp(nll / count), logit_history, prob_history};
 }

-static results_perplexity perplexity(llama_context * ctx, const
+static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
     if (params.ppl_stride > 0) {
         return perplexity_v2(ctx, params);
     }

@@ -500,7 +462,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);

-    std::vector<llama_token> tokens =
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

     auto tim2 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

@@ -521,9 +483,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     const int n_chunk_max = tokens.size() / n_ctx;

     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_batch = params.n_batch;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     int count = 0;
     double nll = 0.0;
     double nll2 = 0.0;

@@ -538,7 +501,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve((
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }

     LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

@@ -620,7 +583,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

             if (num_batches > 1 && n_outputs > 0) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
             }
         }

@@ -661,7 +624,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
         } else {
             double av = nll/count;
             double av2 = nll2/count - av*av;
-            if (av2 > 0)
+            if (av2 > 0) {
+                av2 = sqrt(av2/(count-1));
+            }
             LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
         }
     }

@@ -686,10 +651,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     return {tokens, ppl, logit_history, prob_history};
 }

-static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits,
+static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
     int prev_outputs = 0;
-    for (
-    const
+    for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+        const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);

         llama_batch batch_view = {
             n_tokens,

@@ -699,7 +664,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             batch.n_seq_id + i,
             batch.seq_id + i,
             batch.logits + i,
-            0, 0, 0, // unused
         };

         const int ret = llama_decode(ctx, batch_view);

@@ -713,7 +677,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
             n_outputs += batch_view.logits[i] != 0;
         }

-        memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+        memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));

         prev_outputs += n_outputs;
     }

@@ -728,7 +692,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     if (eval_results.size() != eval_pairs.size()) {
         eval_results.resize(eval_pairs.size());
     }
-    if (eval_pairs.empty())
+    if (eval_pairs.empty()) {
+        return;
+    }

     size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

@@ -736,11 +702,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
         float local_logprobs[K_TOKEN_CHUNK];
         while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
-            if (first >= eval_results.size())
-
+            const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            if (first >= eval_results.size()) {
+                break;
+            }
+            const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
             for (size_t i = first; i < last; ++i) {
-                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+                const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
                 float max_logit = logits[0];
                 for (int j = 1; j < n_vocab; ++j) {
                     max_logit = std::max(max_logit, logits[j]);

@@ -763,7 +731,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
     }
 }

-static void hellaswag_score(llama_context * ctx, const
+static void hellaswag_score(llama_context * ctx, const common_params & params) {
     // Calculates hellaswag score (acc_norm) from prompt
     //
     // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl

@@ -844,7 +812,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
             for (size_t j = 0; j < 4; j++) {
                 hs_cur.ending[j] = prompt_lines[idx*6+2+j];
-                hs_cur.seq_tokens[j] =
+                hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
             }

             // determine the common prefix of the endings

@@ -877,10 +845,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

     double acc = 0.0f;

-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -888,7 +857,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;

@@ -900,7 +869,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         size_t i1 = i0;
         size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

-
+        common_batch_clear(batch);

         // batch as much tasks as possible into the available context
         // each task has 4 unique sequence ids - one for each ending

@@ -916,7 +885,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
         }

         for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
-
+            common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
         }
         batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
         n_logits += 1;

@@ -926,7 +895,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             // TODO: don't evaluate the last token of each sequence
             for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
                 const bool needs_logits = i < seq_tokens_size - 1;
-
+                common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
                 n_logits += needs_logits;
             }
         }

@@ -975,7 +944,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
             auto & hs_cur = hs_data[i];

             // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() +
+            std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));

             const auto first_probs = softmax(tok_logits);

@@ -1102,7 +1071,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
  *
  */
-static void winogrande_score(llama_context * ctx, const
+static void winogrande_score(llama_context * ctx, const common_params & params) {

     constexpr int k_min_trailing_ctx = 3;

@@ -1136,8 +1105,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     LOG_INF("%s : tokenizing selected tasks\n", __func__);

     for (auto & task : data) {
-        task.seq_tokens[0] =
-        task.seq_tokens[1] =
+        task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+        task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);

         task.common_prefix = 0;
         for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {

@@ -1152,16 +1121,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             task.seq_tokens[0].size() - task.common_prefix +
             task.seq_tokens[1].size() - task.common_prefix;

-        task.n_base1 =
-        task.n_base2 =
+        task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
+        task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
     }

     LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);

-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 128;
     const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -1169,7 +1139,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

     std::vector<float> tok_logits(n_vocab);
     // TODO: this could be made smaller; it's currently the worst-case size
-    std::vector<float> batch_logits(
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;

@@ -1184,7 +1154,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
         size_t i1 = i0;
         size_t i_logits = 0;

-
+        common_batch_clear(batch);

         while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
             int n_logits = 0;

@@ -1194,7 +1164,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             }

             for (size_t i = 0; i < data[i1].common_prefix; ++i) {
-
+                common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
             }
             batch.logits[batch.n_tokens - 1] = true;
             n_logits += 1;

@@ -1202,7 +1172,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
             for (int s = 0; s < 2; ++s) {
                 // TODO: end before the last token, no need to predict past the end of the sequences
                 for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
-
+                    common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
                     n_logits += 1;
                 }
             }

@@ -1359,7 +1329,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
             }
             return false;
         }
-        task.seq_tokens.emplace_back(::
+        task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
     }
     auto min_len = task.seq_tokens.front().size();
     for (auto& seq : task.seq_tokens) {

@@ -1403,7 +1373,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
 // git@hf.co:datasets/Stevross/mmlu
 // https://huggingface.co/datasets/truthful_qa
 //
-static void multiple_choice_score(llama_context * ctx, const
+static void multiple_choice_score(llama_context * ctx, const common_params & params) {

     std::istringstream strstream(params.prompt);
     uint32_t n_task;

@@ -1509,17 +1479,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

     LOG("\ntask\tacc_norm\n");

-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
     const int n_batch = params.n_batch;

+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
     const int max_tasks_per_batch = 32;
     const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

     std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(
+    std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

     std::vector<std::pair<size_t, llama_token>> eval_pairs;
     std::vector<float> eval_results;

@@ -1536,7 +1507,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
         size_t i1 = i0;
         size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

-
+        common_batch_clear(batch);

         // batch as much tasks as possible into the available context
         // each task has 4 unique sequence ids - one for each ending

@@ -1559,7 +1530,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

             for (size_t i = 0; i < cur_task.common_prefix; ++i) {
                 //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
-
+                common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
             }
             batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
             n_logits += 1;

@@ -1569,7 +1540,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
                 // TODO: don't evaluate the last token of each sequence
                 for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
                     const bool needs_logits = i < seq_tokens_size - 1;
-
+                    common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
                     n_logits += needs_logits;
                 }
             }

@@ -1627,7 +1598,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
             //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);

             // get the logits of the last token of the common prefix
-            std::memcpy(tok_logits.data(), batch_logits.data() +
+            std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));

             const auto first_probs = softmax(tok_logits);

@@ -1683,7 +1654,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     LOG_INF("\n");
 }

-static void kl_divergence(llama_context * ctx, const
+static void kl_divergence(llama_context * ctx, const common_params & params) {
     if (params.logits_file.empty()) {
         LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
         return;

@@ -1709,7 +1680,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
             __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
     }

-    int n_vocab
+    int n_vocab;
+    int n_chunk;
     in.read((char *)&n_vocab, sizeof(n_vocab));
     in.read((char *)&n_chunk, sizeof(n_chunk));
     if (in.fail()) {

@@ -1720,7 +1692,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
     }

-    std::vector<llama_token> tokens(n_ctx * n_chunk);
+    std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
     if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
         LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
         return;

@@ -1737,7 +1709,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
     std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
     std::vector<float> logits;
     if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
+        logits.reserve(size_t(n_ctx) * n_vocab);
     }

     std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -1778,6 +1750,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
         // clear the KV cache
         llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
         for (int j = 0; j < num_batches; ++j) {
             const int batch_start = start + j * n_batch;
             const int batch_size = std::min(end - batch_start, n_batch);

@@ -1790,9 +1764,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
                 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
             }

-
-
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                 LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                 return;
             }

@@ -1801,10 +1780,12 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

             if (num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+                logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
             }
         }

+        llama_batch_free(batch);
+
         const auto t_end = std::chrono::high_resolution_clock::now();

         if (i == 0) {

@@ -1822,7 +1803,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

         const int first = n_ctx/2;
         const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+        process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
                 workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
         p_diff_ptr += n_ctx - 1 - first;
         kld_ptr += n_ctx - 1 - first;

@@ -1955,17 +1936,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
 }

 int main(int argc, char ** argv) {
-
+    common_params params;

     params.n_ctx = 512;
     params.logits_all = true;
     params.escape = false;

-    if (!
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;
     }

-
+    common_init();

     const int32_t n_ctx = params.n_ctx;

@@ -2004,7 +1985,7 @@ int main(int argc, char ** argv) {
     llama_numa_init(params.numa);

     // load the model and apply lora adapter, if any
-
+    common_init_result llama_init = common_init_from_params(params);

     llama_model * model = llama_init.model;
     llama_context * ctx = llama_init.context;

@@ -2023,7 +2004,7 @@ int main(int argc, char ** argv) {
     // print system information
     {
         LOG_INF("\n");
-        LOG_INF("%s\n",
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     }

     struct results_perplexity results;

@@ -2042,8 +2023,6 @@ int main(int argc, char ** argv) {
     LOG("\n");
     llama_perf_context_print(ctx);

-    write_logfile(ctx, params, model, results);
-
     llama_free(ctx);
     llama_free_model(model);


package/src/llama.cpp/examples/quantize/CMakeLists.txt

@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/quantize/quantize.cpp

@@ -48,9 +48,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
     { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
     { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
-    { "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
-    { "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
     { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
     { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
     { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt

@@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE
+target_compile_features(${TARGET} PRIVATE cxx_std_17)