@fugood/llama.node 0.6.3 → 1.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +40 -30
- package/README.md +4 -1
- package/lib/binding.js +41 -29
- package/lib/binding.ts +26 -25
- package/package.json +40 -7
- package/scripts/build.js +47 -0
- package/scripts/llama.cpp.patch +109 -0
- package/src/anyascii.c +22223 -0
- package/src/anyascii.h +42 -0
- package/src/tts_utils.cpp +20 -7
- package/src/tts_utils.h +2 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +0 -233
- package/src/llama.cpp/.github/workflows/build.yml +0 -1078
- package/src/llama.cpp/.github/workflows/close-issue.yml +0 -28
- package/src/llama.cpp/.github/workflows/docker.yml +0 -178
- package/src/llama.cpp/.github/workflows/editorconfig.yml +0 -29
- package/src/llama.cpp/.github/workflows/gguf-publish.yml +0 -44
- package/src/llama.cpp/.github/workflows/labeler.yml +0 -17
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +0 -33
- package/src/llama.cpp/.github/workflows/python-lint.yml +0 -30
- package/src/llama.cpp/.github/workflows/python-type-check.yml +0 -40
- package/src/llama.cpp/.github/workflows/release.yml +0 -739
- package/src/llama.cpp/.github/workflows/server.yml +0 -237
- package/src/llama.cpp/.github/workflows/winget.yml +0 -42
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +0 -16
- package/src/llama.cpp/cmake/arm64-windows-llvm.cmake +0 -16
- package/src/llama.cpp/cmake/build-info.cmake +0 -64
- package/src/llama.cpp/cmake/common.cmake +0 -35
- package/src/llama.cpp/cmake/git-vars.cmake +0 -22
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -5
- package/src/llama.cpp/common/build-info.cpp.in +0 -4
- package/src/llama.cpp/docs/build.md +0 -561
- package/src/llama.cpp/examples/CMakeLists.txt +0 -43
- package/src/llama.cpp/examples/batched/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/batched/batched.cpp +0 -246
- package/src/llama.cpp/examples/chat-13B.bat +0 -57
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +0 -941
- package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +0 -35
- package/src/llama.cpp/examples/embedding/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/embedding/embedding.cpp +0 -323
- package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +0 -10
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +0 -194
- package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +0 -83
- package/src/llama.cpp/examples/gguf/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gguf/gguf.cpp +0 -265
- package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +0 -22
- package/src/llama.cpp/examples/gguf-hash/deps/rotate-bits/rotate-bits.h +0 -46
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.c +0 -295
- package/src/llama.cpp/examples/gguf-hash/deps/sha1/sha1.h +0 -52
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.c +0 -221
- package/src/llama.cpp/examples/gguf-hash/deps/sha256/sha256.h +0 -24
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.c +0 -42
- package/src/llama.cpp/examples/gguf-hash/deps/xxhash/xxhash.h +0 -7093
- package/src/llama.cpp/examples/gguf-hash/gguf-hash.cpp +0 -694
- package/src/llama.cpp/examples/gritlm/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +0 -229
- package/src/llama.cpp/examples/jeopardy/questions.txt +0 -100
- package/src/llama.cpp/examples/llama.android/app/build.gradle.kts +0 -65
- package/src/llama.cpp/examples/llama.android/build.gradle.kts +0 -6
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +0 -71
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/CMakeLists.txt +0 -53
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +0 -452
- package/src/llama.cpp/examples/llama.android/settings.gradle.kts +0 -18
- package/src/llama.cpp/examples/lookahead/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -472
- package/src/llama.cpp/examples/lookup/CMakeLists.txt +0 -23
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +0 -40
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +0 -47
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -157
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -242
- package/src/llama.cpp/examples/parallel/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -492
- package/src/llama.cpp/examples/passkey/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/passkey/passkey.cpp +0 -277
- package/src/llama.cpp/examples/retrieval/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +0 -304
- package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -246
- package/src/llama.cpp/examples/simple/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/simple/simple.cpp +0 -206
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +0 -206
- package/src/llama.cpp/examples/simple-cmake-pkg/CMakeLists.txt +0 -11
- package/src/llama.cpp/examples/speculative/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/speculative/speculative.cpp +0 -644
- package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +0 -261
- package/src/llama.cpp/examples/sycl/CMakeLists.txt +0 -9
- package/src/llama.cpp/examples/sycl/build.sh +0 -23
- package/src/llama.cpp/examples/sycl/ls-sycl-device.cpp +0 -13
- package/src/llama.cpp/examples/sycl/run-llama2.sh +0 -27
- package/src/llama.cpp/examples/sycl/run-llama3.sh +0 -28
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +0 -33
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +0 -9
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +0 -9
- package/src/llama.cpp/examples/training/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/training/finetune.cpp +0 -96
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +0 -22
- package/src/llama.cpp/ggml/cmake/common.cmake +0 -26
- package/src/llama.cpp/ggml/src/ggml-alloc.c +0 -1042
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +0 -255
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +0 -586
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +0 -2008
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +0 -87
- package/src/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +0 -517
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -74
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +0 -179
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +0 -258
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +0 -2863
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +0 -1110
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +0 -420
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +0 -2570
- package/src/llama.cpp/ggml/src/ggml-common.h +0 -1857
- package/src/llama.cpp/ggml/src/ggml-cpu/cmake/FindSIMD.cmake +0 -100
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +0 -184
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +0 -15
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +0 -243
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +0 -140
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -131
- package/src/llama.cpp/ggml/src/ggml-impl.h +0 -601
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +0 -166
- package/src/llama.cpp/ggml/src/ggml-kompute/ggml-kompute.cpp +0 -2251
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +0 -120
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +0 -622
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +0 -113
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +0 -96
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +0 -5124
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +0 -1037
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -5232
- package/src/llama.cpp/ggml/src/ggml-quants.h +0 -100
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +0 -9
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +0 -1813
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +0 -189
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +0 -37
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +0 -239
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +0 -39
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -83
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +0 -493
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +0 -100
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +0 -623
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +0 -34
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +0 -701
- package/src/llama.cpp/ggml/src/ggml-sycl/cpy.hpp +0 -11
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +0 -791
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +0 -1160
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.hpp +0 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +0 -2957
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +0 -1536
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +0 -75
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +0 -99
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +0 -311
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +0 -4443
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.cpp +0 -105
- package/src/llama.cpp/ggml/src/ggml-sycl/gla.hpp +0 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +0 -136
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +0 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +0 -3030
- package/src/llama.cpp/ggml/src/ggml-sycl/mmq.hpp +0 -33
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +0 -1108
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.hpp +0 -27
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +0 -474
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +0 -26
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +0 -46
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +0 -10
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +0 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +0 -83
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +0 -362
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +0 -264
- package/src/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +0 -13
- package/src/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +0 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +0 -73
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +0 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +0 -1215
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +0 -305
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv.hpp +0 -10
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +0 -12
- package/src/llama.cpp/ggml/src/ggml-threading.h +0 -14
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +0 -196
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +0 -10699
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -39
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +0 -751
- package/src/llama.cpp/ggml/src/ggml.c +0 -6550
- package/src/llama.cpp/ggml/src/gguf.cpp +0 -1330
- package/src/llama.cpp/models/.editorconfig +0 -1
- package/src/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-deepseek-r1-qwen.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-gpt-4o.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +0 -46
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/src/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/src/llama.cpp/pocs/CMakeLists.txt +0 -14
- package/src/llama.cpp/pocs/vdot/CMakeLists.txt +0 -9
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +0 -173
- package/src/llama.cpp/pocs/vdot/vdot.cpp +0 -311
- package/src/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/src/llama.cpp/prompts/alpaca.txt +0 -1
- package/src/llama.cpp/prompts/assistant.txt +0 -31
- package/src/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/src/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/src/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/src/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/src/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/src/llama.cpp/prompts/chat.txt +0 -28
- package/src/llama.cpp/prompts/dan-modified.txt +0 -1
- package/src/llama.cpp/prompts/dan.txt +0 -1
- package/src/llama.cpp/prompts/mnemonics.txt +0 -93
- package/src/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/src/llama.cpp/prompts/reason-act.txt +0 -18
- package/src/llama.cpp/requirements/requirements-all.txt +0 -15
- package/src/llama.cpp/requirements/requirements-compare-llama-bench.txt +0 -2
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +0 -7
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +0 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +0 -5
- package/src/llama.cpp/requirements/requirements-convert_llama_ggml_to_gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +0 -4
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +0 -3
- package/src/llama.cpp/requirements/requirements-pydantic.txt +0 -3
- package/src/llama.cpp/requirements/requirements-test-tokenizer-random.txt +0 -1
- package/src/llama.cpp/requirements/requirements-tool_bench.txt +0 -12
- package/src/llama.cpp/requirements.txt +0 -13
- package/src/llama.cpp/scripts/build-info.sh +0 -30
- package/src/llama.cpp/scripts/install-oneapi.bat +0 -19
- package/src/llama.cpp/scripts/xxd.cmake +0 -16
- package/src/llama.cpp/tests/CMakeLists.txt +0 -177
- package/src/llama.cpp/tests/get-model.cpp +0 -21
- package/src/llama.cpp/tests/get-model.h +0 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +0 -178
- package/src/llama.cpp/tests/test-autorelease.cpp +0 -24
- package/src/llama.cpp/tests/test-backend-ops.cpp +0 -4793
- package/src/llama.cpp/tests/test-barrier.cpp +0 -94
- package/src/llama.cpp/tests/test-c.c +0 -7
- package/src/llama.cpp/tests/test-chat-template.cpp +0 -417
- package/src/llama.cpp/tests/test-chat.cpp +0 -985
- package/src/llama.cpp/tests/test-double-float.cpp +0 -57
- package/src/llama.cpp/tests/test-gbnf-validator.cpp +0 -109
- package/src/llama.cpp/tests/test-gguf.cpp +0 -1338
- package/src/llama.cpp/tests/test-grammar-integration.cpp +0 -1308
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +0 -1201
- package/src/llama.cpp/tests/test-grammar-parser.cpp +0 -519
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +0 -1304
- package/src/llama.cpp/tests/test-llama-grammar.cpp +0 -408
- package/src/llama.cpp/tests/test-log.cpp +0 -39
- package/src/llama.cpp/tests/test-model-load-cancel.cpp +0 -27
- package/src/llama.cpp/tests/test-mtmd-c-api.c +0 -63
- package/src/llama.cpp/tests/test-opt.cpp +0 -904
- package/src/llama.cpp/tests/test-quantize-fns.cpp +0 -186
- package/src/llama.cpp/tests/test-quantize-perf.cpp +0 -365
- package/src/llama.cpp/tests/test-quantize-stats.cpp +0 -424
- package/src/llama.cpp/tests/test-regex-partial.cpp +0 -288
- package/src/llama.cpp/tests/test-rope.cpp +0 -262
- package/src/llama.cpp/tests/test-sampling.cpp +0 -399
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +0 -312
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +0 -155
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +0 -125
- package/src/llama.cpp/tools/CMakeLists.txt +0 -39
- package/src/llama.cpp/tools/batched-bench/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/batched-bench/batched-bench.cpp +0 -204
- package/src/llama.cpp/tools/cvector-generator/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/cvector-generator/completions.txt +0 -582
- package/src/llama.cpp/tools/cvector-generator/cvector-generator.cpp +0 -508
- package/src/llama.cpp/tools/cvector-generator/mean.hpp +0 -48
- package/src/llama.cpp/tools/cvector-generator/negative.txt +0 -4
- package/src/llama.cpp/tools/cvector-generator/pca.hpp +0 -315
- package/src/llama.cpp/tools/cvector-generator/positive.txt +0 -4
- package/src/llama.cpp/tools/export-lora/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/export-lora/export-lora.cpp +0 -434
- package/src/llama.cpp/tools/gguf-split/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/gguf-split/gguf-split.cpp +0 -583
- package/src/llama.cpp/tools/imatrix/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/imatrix/imatrix.cpp +0 -667
- package/src/llama.cpp/tools/llama-bench/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +0 -2024
- package/src/llama.cpp/tools/main/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/main/main.cpp +0 -977
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +0 -58
- package/src/llama.cpp/tools/mtmd/clip-impl.h +0 -462
- package/src/llama.cpp/tools/mtmd/clip.cpp +0 -4024
- package/src/llama.cpp/tools/mtmd/clip.h +0 -101
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +0 -22
- package/src/llama.cpp/tools/mtmd/miniaudio.h +0 -93468
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +0 -855
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +0 -62
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +0 -377
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +0 -297
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +0 -942
- package/src/llama.cpp/tools/mtmd/mtmd.h +0 -362
- package/src/llama.cpp/tools/mtmd/requirements.txt +0 -5
- package/src/llama.cpp/tools/perplexity/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/perplexity/perplexity.cpp +0 -2063
- package/src/llama.cpp/tools/quantize/CMakeLists.txt +0 -6
- package/src/llama.cpp/tools/quantize/quantize.cpp +0 -519
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +0 -4
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +0 -322
- package/src/llama.cpp/tools/run/CMakeLists.txt +0 -16
- package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.cpp +0 -1995
- package/src/llama.cpp/tools/run/linenoise.cpp/linenoise.h +0 -137
- package/src/llama.cpp/tools/run/run.cpp +0 -1261
- package/src/llama.cpp/tools/server/CMakeLists.txt +0 -51
- package/src/llama.cpp/tools/server/bench/requirements.txt +0 -2
- package/src/llama.cpp/tools/server/httplib.h +0 -10506
- package/src/llama.cpp/tools/server/server.cpp +0 -4966
- package/src/llama.cpp/tools/server/tests/requirements.txt +0 -8
- package/src/llama.cpp/tools/server/utils.hpp +0 -1337
- package/src/llama.cpp/tools/tokenize/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/tokenize/tokenize.cpp +0 -416
- package/src/llama.cpp/tools/tts/CMakeLists.txt +0 -5
- package/src/llama.cpp/tools/tts/tts.cpp +0 -1092

package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp
@@ -1,261 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "sampling.h"
-#include "speculative.h"
-#include "log.h"
-#include "llama.h"
-
-#include <cstdio>
-#include <cstring>
-#include <string>
-#include <vector>
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
-        return 1;
-    }
-
-    if (params.n_predict < -1) {
-        LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
-        return 1;
-    }
-
-    common_init();
-
-    if (params.speculative.model.path.empty()) {
-        LOG_ERR("%s: --model-draft is required\n", __func__);
-        return 1;
-    }
-
-    // init llama.cpp
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    llama_model * model_tgt = NULL;
-    //llama_model * model_dft = NULL;
-
-    llama_context * ctx_tgt = NULL;
-    llama_context * ctx_dft = NULL;
-
-    // load the target model
-    common_init_result llama_init_tgt = common_init_from_params(params);
-
-    model_tgt = llama_init_tgt.model.get();
-    ctx_tgt = llama_init_tgt.context.get();
-
-    const llama_vocab * vocab = llama_model_get_vocab(model_tgt);
-
-    // load the draft model
-    params.devices = params.speculative.devices;
-    params.model = params.speculative.model;
-    params.n_ctx = params.speculative.n_ctx;
-    params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
-    params.n_gpu_layers = params.speculative.n_gpu_layers;
-
-    if (params.speculative.cpuparams.n_threads > 0) {
-        params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
-    }
-
-    params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
-    common_init_result llama_init_dft = common_init_from_params(params);
-
-    //model_dft = llama_init_dft.model.get();
-    ctx_dft = llama_init_dft.context.get();
-
-    if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
-        return 1;
-    }
-
-    // Tokenize the prompt
-    std::vector<llama_token> inp;
-    inp = common_tokenize(ctx_tgt, params.prompt, true, true);
-
-    if (llama_n_ctx(ctx_tgt) < (uint32_t) inp.size()) {
-        LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
-
-        return 1;
-    }
-
-    if (llama_n_batch(ctx_tgt) < (uint32_t) inp.size()) {
-        LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
-
-        return 1;
-    }
-
-    LOG("\n\n");
-
-    for (auto id : inp) {
-        LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
-    }
-
-    // how many tokens to draft each time
-    int n_draft = params.speculative.n_max;
-    int n_draft_min = params.speculative.n_min;
-
-    float p_min = params.speculative.p_min;
-
-    int n_predict = 0;
-    int n_drafted = 0;
-    int n_accept = 0;
-
-    // used to determine end of generation
-    bool has_eos = false;
-
-    // ================================================
-    // everything until here is standard initialization
-    // the relevant stuff for speculative decoding starts here
-
-    const auto t_enc_start = ggml_time_us();
-
-    // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
-
-    // eval the prompt
-    llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
-
-    // note: keep the last token separate!
-    llama_token id_last = inp.back();
-
-    // all tokens currently in the target context
-    llama_tokens prompt_tgt(inp.begin(), inp.end() - 1);
-    prompt_tgt.reserve(llama_n_ctx(ctx_tgt));
-
-    int n_past = inp.size() - 1;
-
-    // init the speculator
-    struct common_speculative_params params_spec;
-    params_spec.n_draft = n_draft;
-    params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
-    params_spec.p_min = p_min;
-
-    struct common_speculative * spec = common_speculative_init(ctx_dft);
-
-    llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
-
-    const auto t_enc_end = ggml_time_us();
-
-    const auto t_dec_start = ggml_time_us();
-
-    while (true) {
-        // optionally, generate draft tokens that can be appended to the target batch
-        //
-        // this is the most important part of the speculation. the more probable tokens that are provided here
-        // the better the performance will be. in theory, this computation can be performed asynchronously and even
-        // offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
-        // from a cache or lookup tables.
-        //
-        llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
-
-        //LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
-
-        // always have a token to evaluate from before - id_last
-        common_batch_clear(batch_tgt);
-        common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
-
-        // evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
-        {
-            // do not waste time on small drafts
-            if (draft.size() < (size_t) n_draft_min) {
-                draft.clear();
-            }
-
-            for (size_t i = 0; i < draft.size(); ++i) {
-                common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
-            }
-
-            //LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
-
-            llama_decode(ctx_tgt, batch_tgt);
-        }
-
-        // sample from the full target batch and return the accepted tokens based on the target sampler
-        //
-        // for each token to be accepted, the sampler would have to sample that same token
-        // in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
-        // available logits from the batch and sample the next token until we run out of logits or the sampler
-        // disagrees with the draft
-        //
-        const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
-
-        //LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
-
-        GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
-
-        n_past += ids.size() - 1;
-        n_drafted += draft.size(); // note: we ignore the discarded small drafts
-        n_accept += ids.size() - 1;
-        n_predict += ids.size();
-
-        // process the accepted tokens and update contexts
-        //
-        // this is the standard token post-processing that we normally do
-        // in this case, we do it for a group of accepted tokens at once
-        //
-        for (size_t i = 0; i < ids.size(); ++i) {
-            prompt_tgt.push_back(id_last);
-
-            id_last = ids[i];
-
-            if (llama_vocab_is_eog(vocab, id_last)) {
-                has_eos = true;
-                break;
-            }
-
-            const std::string token_str = common_token_to_piece(ctx_tgt, id_last);
-
-            if (params.use_color && i + 1 < ids.size()) {
-                LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
-            } else {
-                LOG("%s", token_str.c_str());
-            }
-        }
-
-        LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d)\n", (int) ids.size() - 1, (int) draft.size(), id_last);
-
-        {
-            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
-
-            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
-        }
-
-        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
-            break;
-        }
-    }
-
-    auto t_dec_end = ggml_time_us();
-
-    const int n_input = inp.size();
-
-    LOG("\n\n");
-
-    LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
-
-    LOG_INF("\n");
-    LOG_INF("n_draft = %d\n", n_draft);
-    LOG_INF("n_predict = %d\n", n_predict);
-    LOG_INF("n_drafted = %d\n", n_drafted);
-    LOG_INF("n_accept = %d\n", n_accept);
-    LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
-
-    LOG_INF("\n");
-    LOG_INF("draft:\n\n");
-
-    llama_perf_context_print(ctx_dft);
-
-    LOG_INF("\n");
-    LOG_INF("target:\n\n");
-    common_perf_print(ctx_tgt, smpl);
-
-    common_sampler_free(smpl);
-    common_speculative_free(spec);
-
-    llama_backend_free();
-
-    LOG("\n\n");
-
-    return 0;
-}

package/src/llama.cpp/examples/sycl/CMakeLists.txt
@@ -1,9 +0,0 @@
-# MIT license
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: MIT
-
-set(TARGET llama-ls-sycl-device)
-add_executable(${TARGET} ls-sycl-device.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)

package/src/llama.cpp/examples/sycl/build.sh
@@ -1,23 +0,0 @@
-
-# MIT license
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: MIT
-
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-#for FP16
-#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_CURL=OFF # faster for long-prompt inference
-
-#for FP32
-cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=OFF
-
-#build example/main
-#cmake --build . --config Release --target main
-
-#build example/llama-bench
-#cmake --build . --config Release --target llama-bench
-
-#build all binary
-cmake --build . --config Release -j -v

package/src/llama.cpp/examples/sycl/run-llama2.sh
@@ -1,27 +0,0 @@
-#!/bin/bash
-
-# MIT license
-# Copyright (C) 2024 Intel Corporation
-# SPDX-License-Identifier: MIT
-export ONEAPI_DEVICE_SELECTOR="level_zero:0"
-source /opt/intel/oneapi/setvars.sh
-
-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=models/llama-2-7b.Q4_0.gguf
-NGL=99
-CONTEXT=4096
-
-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    echo "use $GGML_SYCL_DEVICE as main GPU"
-    #use signle GPU only
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
-
-else
-    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s 0 -c ${CONTEXT}
-fi

package/src/llama.cpp/examples/sycl/run-llama3.sh
@@ -1,28 +0,0 @@
-#!/bin/bash
-
-# MIT license
-# Copyright (C) 2025 Intel Corporation
-# SPDX-License-Identifier: MIT
-
-# If you want more control, DPC++ Allows selecting a specific device through the
-# following environment variable
-#export ONEAPI_DEVICE_SELECTOR="level_zero:0"
-source /opt/intel/oneapi/setvars.sh
-
-#export GGML_SYCL_DEBUG=1
-
-#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
-
-INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
-MODEL_FILE=models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
-NGL=99 # Layers offloaded to the GPU. If the device runs out of memory, reduce this value according to the model you are using.
-CONTEXT=4096
-
-if [ $# -gt 0 ]; then
-    GGML_SYCL_DEVICE=$1
-    echo "Using $GGML_SYCL_DEVICE as the main GPU"
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT} -mg $GGML_SYCL_DEVICE -sm none
-else
-    #use multiple GPUs with same max compute units
-    ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m ${MODEL_FILE} -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -c ${CONTEXT}
-fi

package/src/llama.cpp/examples/sycl/win-build-sycl.bat
@@ -1,33 +0,0 @@
-
-:: MIT license
-:: Copyright (C) 2024 Intel Corporation
-:: SPDX-License-Identifier: MIT
-
-
-IF not exist build (mkdir build)
-cd build
-if %errorlevel% neq 0 goto ERROR
-
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-if %errorlevel% neq 0 goto ERROR
-
-:: for FP16
-:: faster for long-prompt inference
-:: cmake -G "MinGW Makefiles" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
-
-:: for FP32
-cmake -G "Ninja" .. -DLLAMA_CURL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
-if %errorlevel% neq 0 goto ERROR
-:: build example/main only
-:: make main
-
-:: build all binary
-cmake --build . -j
-if %errorlevel% neq 0 goto ERROR
-
-cd ..
-exit /B 0
-
-:ERROR
-echo comomand error: %errorlevel%
-exit /B %errorlevel%

package/src/llama.cpp/examples/sycl/win-run-llama2.bat
@@ -1,9 +0,0 @@
-:: MIT license
-:: Copyright (C) 2024 Intel Corporation
-:: SPDX-License-Identifier: MIT
-
-set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-
-
-.\build\bin\llama-cli.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 99 -s 0

package/src/llama.cpp/examples/sycl/win-run-llama3.bat
@@ -1,9 +0,0 @@
-:: MIT license
-:: Copyright (C) 2024 Intel Corporation
-:: SPDX-License-Identifier: MIT
-
-set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
-@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
-
-
-.\build\bin\llama-cli.exe -m models\Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf -p %INPUT2% -n 400 -e -ngl 99

package/src/llama.cpp/examples/training/finetune.cpp
@@ -1,96 +0,0 @@
-#include "arg.h"
-#include "common.h"
-#include "log.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-int main(int argc, char ** argv) {
-    common_params params;
-
-    params.escape = false;
-
-    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
-        return 1;
-    }
-
-    if (params.use_mmap) {
-        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
-        params.use_mmap = false;
-    }
-    if (params.cache_type_k != GGML_TYPE_F32) {
-        LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
-        params.cache_type_k = GGML_TYPE_F32;
-    }
-    if (params.cache_type_v != GGML_TYPE_F32) {
-        LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
-        params.cache_type_v = GGML_TYPE_F32;
-    }
-
-    common_init();
-    llama_backend_init();
-    llama_numa_init(params.numa);
-
-    // load the model and apply lora adapter, if any
-    common_init_result llama_init = common_init_from_params(params);
-    llama_model_ptr & model = llama_init.model;
-    llama_context_ptr & ctx = llama_init.context;
-
-    if (model == NULL) {
-        LOG_ERR("%s: unable to load model\n", __func__);
-        return 1;
-    }
-
-    // print system information
-    {
-        LOG_INF("\n");
-        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-    }
-
-    constexpr float val_split = 0.05f;
-
-    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
-    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
-
-    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
-    optimizer_params.adamw.alpha = 1e-7f; // learning rate
-
-    struct llama_opt_params lopt_params {
-        /*n_ctx_train =*/ 0,
-        /*param_filter =*/ llama_opt_param_filter_all,
-        /*param_filter_ud =*/ nullptr,
-        /*get_opt_pars =*/ ggml_opt_get_constant_optimizer_params,
-        /*get_opt_pars_ud =*/ &optimizer_params,
-    };
-    llama_opt_init(ctx.get(), model.get(), lopt_params);
-
-    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
-
-    ggml_opt_result_t result_train = ggml_opt_result_init();
-    ggml_opt_result_t result_eval = ggml_opt_result_init();
-
-    for (int epoch = 0; epoch < 2; ++epoch) {
-        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
-            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
-        fprintf(stderr, "\n");
-
-        ggml_opt_result_reset(result_train);
-        ggml_opt_result_reset(result_eval);
-    }
-    ggml_opt_result_free(result_train);
-    ggml_opt_result_free(result_eval);
-
-    llama_model_save_to_file(model.get(), "finetuned-model.gguf");
-
-    llama_backend_free();
-
-    return 0;
-}

package/src/llama.cpp/ggml/cmake/GitVars.cmake
@@ -1,22 +0,0 @@
-find_package(Git)
-
-# the commit's SHA1
-execute_process(COMMAND
-  "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
-  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-  OUTPUT_VARIABLE GIT_SHA1
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the date of the commit
-execute_process(COMMAND
-  "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
-  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-  OUTPUT_VARIABLE GIT_DATE
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
-
-# the subject of the commit
-execute_process(COMMAND
-  "${GIT_EXECUTABLE}" log -1 --format=%s
-  WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
-  OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
-  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

package/src/llama.cpp/ggml/cmake/common.cmake
@@ -1,26 +0,0 @@
-function(ggml_get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS ${C_FLAGS} PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()