@fugood/llama.node 0.3.16 → 0.4.0
This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0

package/src/llama.cpp/examples/CMakeLists.txt
@@ -12,60 +12,30 @@ llama_add_compile_flags()
 
 # examples
 
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
 if (EMSCRIPTEN)
 else()
-    add_subdirectory(batched-bench)
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
 
-    if (NOT WIN32)
-        # disabled on Windows because it uses internal functions not exported with LLAMA_API
-        add_subdirectory(gbnf-validator)
-    endif()
-
     add_subdirectory(gguf-hash)
-    add_subdirectory(gguf-split)
     add_subdirectory(gguf)
     add_subdirectory(gritlm)
-    add_subdirectory(imatrix)
-    add_subdirectory(infill)
-    add_subdirectory(llama-bench)
     add_subdirectory(lookahead)
     add_subdirectory(lookup)
-    add_subdirectory(main)
     add_subdirectory(parallel)
     add_subdirectory(passkey)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
     add_subdirectory(retrieval)
-    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
     add_subdirectory(save-load-state)
-    add_subdirectory(run)
     add_subdirectory(simple)
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
-    add_subdirectory(tokenize)
-    add_subdirectory(tts)
     add_subdirectory(gen-docs)
+    add_subdirectory(training)
     if (NOT GGML_BACKEND_DL)
-        # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(convert-llama2c-to-ggml)
-
-        add_subdirectory(export-lora)
-        if (NOT WIN32)
-            # disabled on Windows because it uses internal functions not exported with LLAMA_API
-            add_subdirectory(quantize-stats)
-        endif()
-        add_subdirectory(llava)
-        if (GGML_RPC)
-            add_subdirectory(rpc)
-        endif()
+        # these examples use the backends directly and cannot be built with dynamic loading
         if (GGML_SYCL)
             add_subdirectory(sycl)
         endif()

package/src/llama.cpp/examples/batched/batched.cpp
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: error: unable to load model\n" , __func__);

package/src/llama.cpp/examples/embedding/embedding.cpp
@@ -35,23 +35,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
     const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
-    const struct llama_model * model = llama_get_model(ctx);
 
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
-        // encoder-only model
-        if (llama_encode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to encode\n", __func__);
-        }
-    } else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
-        // decoder-only model
-        if (llama_decode(ctx, batch) < 0) {
-            LOG_ERR("%s : failed to decode\n", __func__);
-        }
+    if (llama_encode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to encode\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -89,6 +80,13 @@ int main(int argc, char ** argv) {
     common_init();
 
     params.embedding = true;
+
+    // utilize the full context
+    if (params.n_batch < params.n_ctx) {
+        LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx);
+        params.n_batch = params.n_ctx;
+    }
+
     // For non-causal models, batch size must be equal to ubatch size
     params.n_ubatch = params.n_batch;
 
@@ -134,7 +132,6 @@ int main(int argc, char ** argv) {
 
     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch >= params.n_ctx);
 
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;

package/src/llama.cpp/examples/gritlm/gritlm.cpp
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {
 
     llama_backend_init();
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
 
     // create generation context
     llama_context * ctx = llama_init_from_model(model, cparams);

package/src/llama.cpp/examples/parallel/parallel.cpp
@@ -34,11 +34,61 @@ static std::string k_system =
 R"(Transcript of a never ending dialog, where the User interacts with an Assistant.
 The Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
 
-User: Recommend a nice restaurant in the area.
-Assistant: I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
-User: Who is Richard Feynman?
-Assistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
-User:)";
+User:
+Recommend a nice restaurant in the area.
+Assistant:
+I recommend the restaurant "The Golden Duck". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.
+User:
+Who is Richard Feynman?
+Assistant:
+Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including "Surely You're Joking, Mr. Feynman!" and "What Do You Care What Other People Think?".
+)";
+
+static std::vector<std::string> k_questions = {
+    "What is the tallest mountain in the world?",
+    "Who was the first person to win two Nobel Prizes?",
+    "Which country invented paper?",
+    "What organ is primarily responsible for pumping blood throughout the body?",
+    "Which planet is known for its prominent ring system?",
+    "Who directed the movie 'Inception'?",
+    "What is the freezing point of water in Fahrenheit?",
+    "Which animal is known to have the longest lifespan?",
+    "What language has the most native speakers worldwide?",
+    "What is the capital city of Canada?",
+    "Who is credited with inventing the World Wide Web?",
+    "Which metal is liquid at room temperature?",
+    "What is the term for an animal that eats both plants and meat?",
+    "Who painted 'The Starry Night'?",
+    "What gas do humans exhale that plants use for photosynthesis?",
+    "What year did World War II end?",
+    "Which continent has the most countries?",
+    "Who wrote the novel 'Frankenstein'?",
+    "What does DNA stand for?",
+    "What is the main ingredient in traditional Japanese miso soup?"
+};
+
+static std::vector<std::string> k_answers = {
+    "The tallest mountain in the world is Mount Everest.",
+    "Marie Curie was the first person to win two Nobel Prizes.",
+    "Paper was invented in China.",
+    "The heart is the organ responsible for pumping blood.",
+    "Saturn is known for its prominent ring system.",
+    "Christopher Nolan directed the movie 'Inception'.",
+    "The freezing point of water in Fahrenheit is 32°F.",
+    "The bowhead whale is known to have the longest lifespan among mammals.",
+    "Mandarin Chinese has the most native speakers in the world.",
+    "The capital city of Canada is Ottawa.",
+    "Tim Berners-Lee is credited with inventing the World Wide Web.",
+    "Mercury is the metal that is liquid at room temperature.",
+    "An animal that eats both plants and meat is called an omnivore.",
+    "'The Starry Night' was painted by Vincent van Gogh.",
+    "Humans exhale carbon dioxide, which plants use in photosynthesis.",
+    "World War II ended in 1945.",
+    "Africa is the continent with the most countries.",
+    "The novel 'Frankenstein' was written by Mary Shelley.",
+    "DNA stands for Deoxyribonucleic Acid.",
+    "The main ingredient in traditional Japanese miso soup is fermented soybean paste."
+};
 
 static std::vector<std::string> k_prompts = {
     "What is the meaning of life?",
@@ -49,7 +99,7 @@ static std::vector<std::string> k_prompts = {
     "What is the best way to learn a new language?",
     "How to get a job at Google?",
     "If you could have any superpower, what would it be?",
-    "I want to learn how to play the piano.",
+    "I want to learn how to play the piano. What would be the best way to do it?",
 };
 
 struct client {
@@ -68,6 +118,7 @@ struct client {
     int64_t t_start_prompt;
     int64_t t_start_gen;
 
+    int32_t n_past = 0;
     int32_t n_prompt = 0;
     int32_t n_decoded = 0;
     int32_t i_batch = -1;
@@ -106,6 +157,9 @@ int main(int argc, char ** argv) {
 
     common_params params;
 
+    params.n_predict = 128;
+    params.n_junk = 0;
+
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
@@ -126,6 +180,12 @@ int main(int argc, char ** argv) {
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
+    // is the system prompt shared in the cache
+    const bool is_sp_shared = params.is_pp_shared;
+
+    // extra text to insert in each client's prompt in order to make it larger
+    const int32_t n_junk = params.n_junk;
+
     // init llama.cpp
     llama_backend_init();
     llama_numa_init(params.numa);
@@ -167,6 +227,7 @@ int main(int argc, char ** argv) {
     }
 
     std::vector<llama_token> tokens_system;
+
     tokens_system = common_tokenize(ctx, k_system, true);
     const int32_t n_tokens_system = tokens_system.size();
 
@@ -188,7 +249,7 @@ int main(int argc, char ** argv) {
     LOG_INF("%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
     LOG_INF("\n");
 
-    {
+    if (is_sp_shared) {
         LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
         for (int32_t i = 0; i < n_tokens_system; ++i) {
@@ -226,7 +287,7 @@ int main(int argc, char ** argv) {
 
             client.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+            common_batch_add(batch, client.sampled, client.n_past++, { client.id + 1 }, true);
 
             client.n_decoded += 1;
         }
@@ -252,9 +313,23 @@ int main(int argc, char ** argv) {
                 client.t_start_gen = 0;
 
                 client.input = k_prompts[rand() % k_prompts.size()];
-                client.prompt = client.input + "\nAssistant:";
                 client.response = "";
 
+                // construct the prompt:
+                // [system prompt] + [junk] + [user prompt]
+                client.n_past = 0;
+                client.prompt = "";
+                if (is_sp_shared) {
+                    client.n_past = n_tokens_system;
+                } else {
+                    client.prompt += k_system;
+                }
+                for (int i = 0; i < n_junk; ++i) {
+                    const int r = rand() % k_questions.size();
+                    client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n";
+                }
+                client.prompt += "User:\n" + client.input + "\nAssistant:\n";
+
                 common_sampler_reset(client.smpl);
 
                 // do not prepend BOS because we have a system prompt!
@@ -262,7 +337,7 @@ int main(int argc, char ** argv) {
                 tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
                 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-                    common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+                    common_batch_add(batch, tokens_prompt[i], client.n_past++, { client.id + 1 }, false);
                 }
 
                 // extract the logits only for the last token
@@ -361,10 +436,9 @@ int main(int argc, char ** argv) {
             //         client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
 
             if (client.n_decoded > 2 &&
-                    (llama_vocab_is_eog(vocab, id) ||
-                     (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
-                     client.response.find("User:") != std::string::npos ||
-                     client.response.find('\n') != std::string::npos)) {
+                    (llama_vocab_is_eog(vocab, id) ||
+                     (params.n_predict > 0 && client.n_decoded >= params.n_predict) ||
+                     client.response.find("User:") != std::string::npos)) {
                 // basic reverse prompt
                 const size_t pos = client.response.find("User:");
                 if (pos != std::string::npos) {
@@ -405,7 +479,7 @@ int main(int argc, char ** argv) {
         params.prompt_file = "used built-in defaults";
     }
     LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
 
     LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
     LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);

package/src/llama.cpp/examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
     llama_model_params model_params = common_model_params_to_llama(params);
 
-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
 
     if (model == NULL) {
         LOG_ERR("%s: unable to load model\n" , __func__);

package/src/llama.cpp/examples/sycl/build.sh
@@ -8,10 +8,10 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
 
 #for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
 
 #build example/main
 #cmake --build . --config Release --target main

package/src/llama.cpp/examples/sycl/win-build-sycl.bat
@@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
 
 :: for FP16
 :: faster for long-prompt inference
-:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
+:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
 
 :: for FP32
-cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
+cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
 if %errorlevel% neq 0 goto ERROR
 :: build example/main only
 :: make main

package/src/llama.cpp/examples/training/finetune.cpp
@@ -0,0 +1,96 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.escape = false;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+        return 1;
+    }
+
+    if (params.use_mmap) {
+        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
+        params.use_mmap = false;
+    }
+    if (params.cache_type_k != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_k = GGML_TYPE_F32;
+    }
+    if (params.cache_type_v != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_v = GGML_TYPE_F32;
+    }
+
+    common_init();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model and apply lora adapter, if any
+    common_init_result llama_init = common_init_from_params(params);
+    llama_model_ptr & model = llama_init.model;
+    llama_context_ptr & ctx = llama_init.context;
+
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+
+    constexpr float val_split = 0.05f;
+
+    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
+
+    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
+    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+
+    struct llama_opt_params lopt_params {
+        /*n_ctx_train     =*/ 0,
+        /*param_filter    =*/ llama_opt_param_filter_all,
+        /*param_filter_ud =*/ nullptr,
+        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
+        /*get_opt_pars_ud =*/ &optimizer_params,
+    };
+    llama_opt_init(ctx.get(), model.get(), lopt_params);
+
+    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
+
+    ggml_opt_result_t result_train = ggml_opt_result_init();
+    ggml_opt_result_t result_eval = ggml_opt_result_init();
+
+    for (int epoch = 0; epoch < 2; ++epoch) {
+        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+        fprintf(stderr, "\n");
+
+        ggml_opt_result_reset(result_train);
+        ggml_opt_result_reset(result_eval);
+    }
+    ggml_opt_result_free(result_train);
+    ggml_opt_result_free(result_eval);
+
+    llama_model_save_to_file(model.get(), "finetuned-model.gguf");
+
+    llama_backend_free();
+
+    return 0;
+}

package/src/llama.cpp/ggml/CMakeLists.txt
@@ -100,9 +100,14 @@ else()
     set(INS_ENB ON)
 endif()
 
+message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB             : ${INS_ENB}")
+
 option(GGML_CPU_HBM      "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64  "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
+option(GGML_SSE42        "ggml: enable SSE 4.2" ${INS_ENB})
 option(GGML_AVX          "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX_VNNI     "ggml: enable AVX-VNNI" OFF)
 option(GGML_AVX2         "ggml: enable AVX2" ${INS_ENB})
@@ -123,10 +128,12 @@ endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX  "ggml: enable lsx" ON)
 option(GGML_RVV  "ggml: enable rvv" ON)
+option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
 option(GGML_VXE  "ggml: enable vxe" ON)
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
 
 
 if (WIN32)
@@ -164,7 +171,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS        "ggml: use HIP graph, experimental, slow" OFF)
 option(GGML_HIP_NO_VMM        "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA           "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN               "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG         "ggml: enable Vulkan debug output" OFF)
@@ -187,6 +193,7 @@ option(GGML_RPC "ggml: use RPC"
 option(GGML_SYCL       "ggml: use SYCL" OFF)
 option(GGML_SYCL_F16   "ggml: use 16 bit floats for sycl calculations" OFF)
 option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
+option(GGML_SYCL_DNN   "ggml: enable oneDNN in the SYCL backend" ON)
 set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                 "ggml: sycl target device")
 set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
@@ -354,3 +361,29 @@ write_basic_package_version_file(
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)
+
+if (MSVC)
+    set(MSVC_WARNING_FLAGS
+        /wd4005  # Macro redefinition
+        /wd4244  # Conversion from one type to another type, possible loss of data
+        /wd4267  # Conversion from 'size_t' to a smaller type, possible loss of data
+        /wd4996  # Disable POSIX deprecation warnings
+        /wd4702  # Unreachable code warnings
+    )
+    function(disable_msvc_warnings target_name)
+        if(TARGET ${target_name})
+            target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+        endif()
+    endfunction()
+
+    disable_msvc_warnings(ggml-base)
+    disable_msvc_warnings(ggml)
+    disable_msvc_warnings(ggml-cpu)
+    disable_msvc_warnings(ggml-cpu-x64)
+    disable_msvc_warnings(ggml-cpu-sse42)
+    disable_msvc_warnings(ggml-cpu-sandybridge)
+    disable_msvc_warnings(ggml-cpu-haswell)
+    disable_msvc_warnings(ggml-cpu-skylakex)
+    disable_msvc_warnings(ggml-cpu-icelake)
+    disable_msvc_warnings(ggml-cpu-alderlake)
+endif()

package/src/llama.cpp/ggml/cmake/GitVars.cmake
@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_SHA1
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_DATE
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+  "${GIT_EXECUTABLE}" log -1 --format=%s
+  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+  OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

package/src/llama.cpp/ggml/include/ggml-backend.h
@@ -38,7 +38,7 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
     GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
 
@@ -59,7 +59,7 @@ extern "C" {
     GGML_API enum ggml_status      ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API size_t                 ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t                ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor);
     GGML_API void                  ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
     GGML_API bool                  ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API void                  ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
@@ -248,7 +248,7 @@ extern "C" {
       // preferrably to run on the same backend as the buffer
       ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
 
-      sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+      sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
 
       // initialize buffers from a max size graph (optional)
       reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
 
     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
     GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph

package/src/llama.cpp/ggml/include/ggml-cpp.h
@@ -24,7 +24,7 @@ typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
 
 struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
 
-typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
+typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
 
 // ggml-backend
 

package/src/llama.cpp/ggml/include/ggml-cpu.h
@@ -133,6 +133,11 @@ extern "C" {
 
     GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 
+    GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);
+    GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t);
+
 #ifdef __cplusplus
 }
 #endif