@fugood/llama.node 0.3.16 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +6 -1
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +44 -2
- package/lib/index.js +132 -1
- package/lib/index.ts +203 -3
- package/package.json +2 -1
- package/src/EmbeddingWorker.cpp +1 -1
- package/src/LlamaCompletionWorker.cpp +374 -19
- package/src/LlamaCompletionWorker.h +31 -10
- package/src/LlamaContext.cpp +216 -7
- package/src/LlamaContext.h +12 -0
- package/src/common.hpp +15 -0
- package/src/llama.cpp/.github/workflows/build-linux-cross.yml +233 -0
- package/src/llama.cpp/.github/workflows/build.yml +89 -767
- package/src/llama.cpp/.github/workflows/docker.yml +9 -6
- package/src/llama.cpp/.github/workflows/release.yml +716 -0
- package/src/llama.cpp/.github/workflows/server.yml +19 -23
- package/src/llama.cpp/CMakeLists.txt +11 -1
- package/src/llama.cpp/cmake/build-info.cmake +8 -2
- package/src/llama.cpp/cmake/x64-windows-llvm.cmake +0 -6
- package/src/llama.cpp/common/CMakeLists.txt +35 -4
- package/src/llama.cpp/common/arg.cpp +844 -121
- package/src/llama.cpp/common/arg.h +9 -0
- package/src/llama.cpp/common/chat.cpp +129 -107
- package/src/llama.cpp/common/chat.h +2 -0
- package/src/llama.cpp/common/common.cpp +64 -518
- package/src/llama.cpp/common/common.h +35 -45
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +3 -0
- package/src/llama.cpp/common/llguidance.cpp +31 -47
- package/src/llama.cpp/common/minja/chat-template.hpp +23 -11
- package/src/llama.cpp/common/minja/minja.hpp +186 -127
- package/src/llama.cpp/common/regex-partial.cpp +204 -0
- package/src/llama.cpp/common/regex-partial.h +56 -0
- package/src/llama.cpp/common/sampling.cpp +60 -50
- package/src/llama.cpp/docs/build.md +122 -7
- package/src/llama.cpp/examples/CMakeLists.txt +2 -32
- package/src/llama.cpp/examples/batched/batched.cpp +1 -1
- package/src/llama.cpp/examples/embedding/embedding.cpp +9 -12
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +1 -1
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/parallel/parallel.cpp +89 -15
- package/src/llama.cpp/examples/passkey/passkey.cpp +1 -1
- package/src/llama.cpp/examples/speculative/speculative.cpp +1 -1
- package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +1 -1
- package/src/llama.cpp/examples/sycl/build.sh +2 -2
- package/src/llama.cpp/examples/sycl/win-build-sycl.bat +2 -2
- package/src/llama.cpp/examples/training/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/training/finetune.cpp +96 -0
- package/src/llama.cpp/ggml/CMakeLists.txt +35 -2
- package/src/llama.cpp/ggml/cmake/GitVars.cmake +22 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +4 -4
- package/src/llama.cpp/ggml/include/ggml-cpp.h +1 -1
- package/src/llama.cpp/ggml/include/ggml-cpu.h +5 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +47 -28
- package/src/llama.cpp/ggml/include/ggml-rpc.h +6 -1
- package/src/llama.cpp/ggml/include/ggml.h +76 -106
- package/src/llama.cpp/ggml/src/CMakeLists.txt +11 -8
- package/src/llama.cpp/ggml/src/ggml-alloc.c +4 -1
- package/src/llama.cpp/ggml/src/ggml-backend.cpp +9 -5
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +0 -2
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +8 -4
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +5 -5
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +692 -1534
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +613 -122
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +135 -1
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +507 -137
- package/src/llama.cpp/ggml/src/ggml-common.h +12 -6
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +66 -33
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.cpp +158 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/binary-ops.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/common.h +72 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +896 -194
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +2 -21
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +1060 -410
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1008 -13533
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +31 -16
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +90 -12
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +47 -13
- package/src/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +266 -72
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1034 -88
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +8796 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.h +110 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +892 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +186 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +28 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.cpp +252 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +802 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +23 -4
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +7 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +1 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +0 -4
- package/src/llama.cpp/ggml/src/ggml-impl.h +52 -18
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +106 -14
- package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +67 -119
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1023 -262
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +368 -190
- package/src/llama.cpp/ggml/src/ggml-quants.c +0 -6
- package/src/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +307 -40
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +125 -45
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +10 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +239 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/binbcast.hpp +39 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +0 -35
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +9 -307
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +72 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +14 -7
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +59 -21
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +7 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +79 -90
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +944 -438
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +22 -23
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +37 -8
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +24 -20
- package/src/llama.cpp/ggml/src/ggml-sycl/getrows.hpp +1 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +507 -411
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +84 -74
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +185 -89
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +37 -49
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.hpp +7 -22
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +4 -14
- package/src/llama.cpp/ggml/src/ggml-sycl/quants.hpp +83 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +204 -118
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.hpp +1 -3
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +128 -53
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +83 -49
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +1278 -282
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +32 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +133 -30
- package/src/llama.cpp/ggml/src/ggml.c +170 -265
- package/src/llama.cpp/ggml/src/gguf.cpp +34 -33
- package/src/llama.cpp/include/llama.h +82 -22
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-llama4.gguf.out +46 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-pixtral.gguf.out +46 -0
- package/src/llama.cpp/requirements/requirements-all.txt +5 -3
- package/src/llama.cpp/requirements/requirements-gguf_editor_gui.txt +3 -0
- package/src/llama.cpp/scripts/xxd.cmake +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +4 -2
- package/src/llama.cpp/src/llama-adapter.cpp +43 -1
- package/src/llama.cpp/src/llama-arch.cpp +163 -17
- package/src/llama.cpp/src/llama-arch.h +16 -0
- package/src/llama.cpp/src/llama-batch.cpp +5 -1
- package/src/llama.cpp/src/llama-batch.h +2 -1
- package/src/llama.cpp/src/llama-chat.cpp +91 -16
- package/src/llama.cpp/src/llama-chat.h +7 -2
- package/src/llama.cpp/src/llama-context.cpp +479 -575
- package/src/llama.cpp/src/llama-context.h +44 -33
- package/src/llama.cpp/src/llama-cparams.h +1 -0
- package/src/llama.cpp/src/llama-graph.cpp +209 -157
- package/src/llama.cpp/src/llama-graph.h +38 -14
- package/src/llama.cpp/src/llama-hparams.h +13 -0
- package/src/llama.cpp/src/llama-kv-cache.cpp +1604 -543
- package/src/llama.cpp/src/llama-kv-cache.h +283 -171
- package/src/llama.cpp/src/llama-memory.h +12 -2
- package/src/llama.cpp/src/llama-mmap.cpp +1 -1
- package/src/llama.cpp/src/llama-model-loader.cpp +34 -20
- package/src/llama.cpp/src/llama-model-loader.h +5 -3
- package/src/llama.cpp/src/llama-model-saver.cpp +281 -0
- package/src/llama.cpp/src/llama-model-saver.h +37 -0
- package/src/llama.cpp/src/llama-model.cpp +1803 -330
- package/src/llama.cpp/src/llama-model.h +21 -2
- package/src/llama.cpp/src/llama-quant.cpp +33 -10
- package/src/llama.cpp/src/llama-sampling.cpp +25 -7
- package/src/llama.cpp/src/llama-vocab.cpp +86 -10
- package/src/llama.cpp/src/llama-vocab.h +6 -0
- package/src/llama.cpp/src/llama.cpp +15 -1
- package/src/llama.cpp/tests/CMakeLists.txt +52 -31
- package/src/llama.cpp/tests/test-arg-parser.cpp +51 -4
- package/src/llama.cpp/tests/test-backend-ops.cpp +189 -90
- package/src/llama.cpp/tests/test-chat-template.cpp +26 -6
- package/src/llama.cpp/tests/test-chat.cpp +15 -3
- package/src/llama.cpp/{examples/gbnf-validator/gbnf-validator.cpp → tests/test-gbnf-validator.cpp} +2 -2
- package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -2
- package/src/llama.cpp/tests/test-grammar-llguidance.cpp +63 -2
- package/src/llama.cpp/tests/test-grammar-parser.cpp +3 -1
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -1
- package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -1
- package/src/llama.cpp/tests/test-mtmd-c-api.c +63 -0
- package/src/llama.cpp/tests/test-opt.cpp +33 -21
- package/src/llama.cpp/{examples/quantize-stats/quantize-stats.cpp → tests/test-quantize-stats.cpp} +3 -1
- package/src/llama.cpp/tests/test-regex-partial.cpp +288 -0
- package/src/llama.cpp/tests/test-sampling.cpp +1 -1
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +2 -1
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +2 -1
- package/src/llama.cpp/tools/CMakeLists.txt +39 -0
- package/src/llama.cpp/{examples → tools}/batched-bench/batched-bench.cpp +3 -3
- package/src/llama.cpp/{examples → tools}/export-lora/export-lora.cpp +1 -1
- package/src/llama.cpp/{examples → tools}/gguf-split/gguf-split.cpp +15 -16
- package/src/llama.cpp/{examples → tools}/imatrix/imatrix.cpp +11 -9
- package/src/llama.cpp/{examples → tools}/llama-bench/llama-bench.cpp +623 -274
- package/src/llama.cpp/{examples → tools}/main/main.cpp +22 -14
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +47 -0
- package/src/llama.cpp/tools/mtmd/clip-impl.h +365 -0
- package/src/llama.cpp/tools/mtmd/clip.cpp +3646 -0
- package/src/llama.cpp/tools/mtmd/clip.h +99 -0
- package/src/llama.cpp/tools/mtmd/deprecation-warning.cpp +22 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +370 -0
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +310 -0
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +678 -0
- package/src/llama.cpp/tools/mtmd/mtmd.h +331 -0
- package/src/llama.cpp/{examples → tools}/perplexity/perplexity.cpp +21 -5
- package/src/llama.cpp/{examples → tools}/quantize/quantize.cpp +53 -3
- package/src/llama.cpp/tools/rpc/CMakeLists.txt +4 -0
- package/src/llama.cpp/tools/rpc/rpc-server.cpp +322 -0
- package/src/llama.cpp/tools/run/CMakeLists.txt +16 -0
- package/src/llama.cpp/{examples → tools}/run/run.cpp +30 -30
- package/src/llama.cpp/{examples → tools}/server/CMakeLists.txt +2 -1
- package/src/llama.cpp/{examples → tools}/server/httplib.h +313 -247
- package/src/llama.cpp/{examples → tools}/server/server.cpp +529 -215
- package/src/llama.cpp/{examples → tools}/server/utils.hpp +427 -6
- package/src/llama.cpp/{examples → tools}/tts/tts.cpp +6 -9
- package/src/llama.cpp/cmake/arm64-windows-msvc.cmake +0 -6
- package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/CMakeLists.txt +0 -5
- package/src/llama.cpp/examples/infill/infill.cpp +0 -590
- package/src/llama.cpp/examples/llava/CMakeLists.txt +0 -66
- package/src/llama.cpp/examples/llava/android/build_64.sh +0 -8
- package/src/llama.cpp/examples/llava/clip-quantize-cli.cpp +0 -59
- package/src/llama.cpp/examples/llava/clip.cpp +0 -3206
- package/src/llama.cpp/examples/llava/clip.h +0 -118
- package/src/llama.cpp/examples/llava/gemma3-cli.cpp +0 -341
- package/src/llama.cpp/examples/llava/llava-cli.cpp +0 -332
- package/src/llama.cpp/examples/llava/llava.cpp +0 -574
- package/src/llama.cpp/examples/llava/llava.h +0 -49
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +0 -354
- package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +0 -584
- package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/rpc/CMakeLists.txt +0 -2
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +0 -171
- package/src/llama.cpp/examples/run/CMakeLists.txt +0 -5
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +0 -30
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +0 -19
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +0 -234
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +0 -197
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +0 -190
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +0 -204
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +0 -191
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +0 -218
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +0 -216
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +0 -295
- /package/src/llama.cpp/{examples → tools}/batched-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/completions.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/cvector-generator.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/mean.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/negative.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/pca.hpp +0 -0
- /package/src/llama.cpp/{examples → tools}/cvector-generator/positive.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/export-lora/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/gguf-split/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/imatrix/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/llama-bench/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/main/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples/llava → tools/mtmd}/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/perplexity/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/quantize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/run/linenoise.cpp/linenoise.h +0 -0
- /package/src/llama.cpp/{examples → tools}/server/bench/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/server/tests/requirements.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/CMakeLists.txt +0 -0
- /package/src/llama.cpp/{examples → tools}/tokenize/tokenize.cpp +0 -0
- /package/src/llama.cpp/{examples → tools}/tts/CMakeLists.txt +0 -0
|
@@ -12,12 +12,30 @@ if (CUDAToolkit_FOUND)
|
|
|
12
12
|
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
|
13
13
|
# 70 == V100, FP16 tensor cores
|
|
14
14
|
# 75 == Turing, int8 tensor cores
|
|
15
|
+
# 80 == Ampere, asynchronous data loading, faster tensor core instructions
|
|
16
|
+
# 86 == RTX 3000, needs CUDA v11.1
|
|
17
|
+
# 89 == RTX 4000, needs CUDA v11.8
|
|
18
|
+
#
|
|
19
|
+
# XX-virtual == compile CUDA code as PTX, do JIT compilation to binary code on first run
|
|
20
|
+
# XX-real == compile CUDA code as device code for this specific architecture
|
|
21
|
+
# no suffix == compile as both PTX and device code
|
|
22
|
+
#
|
|
23
|
+
# The default behavior for a non-native is to build virtual architectures as needed to cover all features needed
|
|
24
|
+
# for best performance and to also build real architectures for the most commonly used GPUs.
|
|
15
25
|
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
|
16
26
|
set(CMAKE_CUDA_ARCHITECTURES "native")
|
|
17
27
|
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
|
18
|
-
|
|
28
|
+
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
|
29
|
+
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
|
30
|
+
else()
|
|
31
|
+
set(CMAKE_CUDA_ARCHITECTURES "60-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
|
32
|
+
endif()
|
|
19
33
|
else()
|
|
20
|
-
|
|
34
|
+
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.8")
|
|
35
|
+
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real;89-real")
|
|
36
|
+
else()
|
|
37
|
+
set(CMAKE_CUDA_ARCHITECTURES "50-virtual;61-virtual;70-virtual;75-virtual;80-virtual;86-real")
|
|
38
|
+
endif()
|
|
21
39
|
endif()
|
|
22
40
|
endif()
|
|
23
41
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
|
@@ -100,7 +118,7 @@ if (CUDAToolkit_FOUND)
|
|
|
100
118
|
|
|
101
119
|
set(CUDA_CXX_FLAGS "")
|
|
102
120
|
|
|
103
|
-
set(CUDA_FLAGS -use_fast_math)
|
|
121
|
+
set(CUDA_FLAGS -use_fast_math -extended-lambda)
|
|
104
122
|
|
|
105
123
|
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
|
|
106
124
|
# Options are:
|
|
@@ -133,6 +151,7 @@ if (CUDAToolkit_FOUND)
|
|
|
133
151
|
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
|
134
152
|
OUTPUT_VARIABLE CUDA_CCVER
|
|
135
153
|
ERROR_QUIET
|
|
154
|
+
OUTPUT_STRIP_TRAILING_WHITESPACE
|
|
136
155
|
)
|
|
137
156
|
else()
|
|
138
157
|
if (CUDA_CCFULLVER MATCHES Apple)
|
|
@@ -143,7 +162,7 @@ if (CUDAToolkit_FOUND)
|
|
|
143
162
|
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
|
144
163
|
endif()
|
|
145
164
|
|
|
146
|
-
message("
|
|
165
|
+
message(STATUS "CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
|
147
166
|
|
|
148
167
|
ggml_get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
|
149
168
|
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
|
21
21
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
|
22
22
|
#define CUDA_R_16F HIPBLAS_R_16F
|
|
23
|
+
#define CUDA_R_16BF HIPBLAS_R_16B
|
|
23
24
|
#define CUDA_R_32F HIPBLAS_R_32F
|
|
24
25
|
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
|
|
25
26
|
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
|
|
@@ -70,6 +71,8 @@
|
|
|
70
71
|
#define cudaLaunchHostFunc hipLaunchHostFunc
|
|
71
72
|
#define cudaMalloc hipMalloc
|
|
72
73
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
|
74
|
+
#define cudaMallocManaged hipMallocManaged
|
|
75
|
+
#define cudaMemAdvise hipMemAdvise
|
|
73
76
|
#define cudaMemcpy hipMemcpy
|
|
74
77
|
#define cudaMemcpyAsync hipMemcpyAsync
|
|
75
78
|
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
|
@@ -151,6 +154,10 @@
|
|
|
151
154
|
#define CDNA
|
|
152
155
|
#endif
|
|
153
156
|
|
|
157
|
+
#if defined(__GFX12__)
|
|
158
|
+
#define RDNA4
|
|
159
|
+
#endif
|
|
160
|
+
|
|
154
161
|
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
|
|
155
162
|
defined(__gfx1150__) || defined(__gfx1151__)
|
|
156
163
|
#define RDNA3
|
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
|
|
16
16
|
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
|
|
17
17
|
#define CUDA_R_16F MUSA_R_16F
|
|
18
|
+
#define CUDA_R_16BF MUSA_R_16BF
|
|
18
19
|
#define CUDA_R_32F MUSA_R_32F
|
|
19
20
|
#define cublasComputeType_t cudaDataType_t
|
|
20
21
|
#define cublasCreate mublasCreate
|
|
@@ -148,8 +148,14 @@ struct ggml_map_custom2_op_params {
|
|
|
148
148
|
|
|
149
149
|
struct ggml_map_custom3_op_params {
|
|
150
150
|
ggml_custom3_op_t fun;
|
|
151
|
-
int
|
|
152
|
-
void
|
|
151
|
+
int n_tasks;
|
|
152
|
+
void * userdata;
|
|
153
|
+
};
|
|
154
|
+
|
|
155
|
+
struct ggml_custom_op_params {
|
|
156
|
+
ggml_custom_op_t fun;
|
|
157
|
+
int n_tasks;
|
|
158
|
+
void * userdata;
|
|
153
159
|
};
|
|
154
160
|
|
|
155
161
|
// bitset
|
|
@@ -311,29 +317,28 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
311
317
|
|
|
312
318
|
// FP16 to FP32 conversion
|
|
313
319
|
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
#if defined(__ARM_NEON) && !defined(_MSC_VER) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11)
|
|
320
|
+
// 16-bit float
|
|
321
|
+
// on Arm, we use __fp16
|
|
322
|
+
// on x86, we use uint16_t
|
|
323
|
+
//
|
|
324
|
+
// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
|
|
325
|
+
// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
|
|
326
|
+
//
|
|
327
|
+
#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
|
323
328
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
324
329
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
325
330
|
|
|
326
331
|
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
327
332
|
|
|
328
333
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
329
|
-
|
|
334
|
+
__fp16 tmp;
|
|
330
335
|
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
331
336
|
return (float)tmp;
|
|
332
337
|
}
|
|
333
338
|
|
|
334
339
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
335
340
|
ggml_fp16_t res;
|
|
336
|
-
|
|
341
|
+
__fp16 tmp = f;
|
|
337
342
|
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
338
343
|
return res;
|
|
339
344
|
}
|
|
@@ -357,8 +362,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
357
362
|
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
358
363
|
|
|
359
364
|
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
360
|
-
|
|
361
|
-
|
|
365
|
+
float f;
|
|
366
|
+
double d;
|
|
362
367
|
__asm__(
|
|
363
368
|
"mtfprd %0,%2\n"
|
|
364
369
|
"xscvhpdp %0,%0\n"
|
|
@@ -370,8 +375,8 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
370
375
|
}
|
|
371
376
|
|
|
372
377
|
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
373
|
-
|
|
374
|
-
|
|
378
|
+
double d;
|
|
379
|
+
ggml_fp16_t r;
|
|
375
380
|
__asm__( /* xscvdphp can work on double or single precision */
|
|
376
381
|
"xscvdphp %0,%2\n"
|
|
377
382
|
"mffprd %1,%0\n" :
|
|
@@ -381,6 +386,35 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
381
386
|
return r;
|
|
382
387
|
}
|
|
383
388
|
|
|
389
|
+
#elif defined(__riscv) && defined(GGML_RV_ZFH)
|
|
390
|
+
|
|
391
|
+
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
392
|
+
float f;
|
|
393
|
+
__asm__(
|
|
394
|
+
"fmv.h.x %[f], %[h]\n\t"
|
|
395
|
+
"fcvt.s.h %[f], %[f]"
|
|
396
|
+
: [f] "=&f" (f)
|
|
397
|
+
: [h] "r" (h)
|
|
398
|
+
);
|
|
399
|
+
return f;
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
403
|
+
ggml_fp16_t res;
|
|
404
|
+
__asm__(
|
|
405
|
+
"fcvt.h.s %[f], %[f]\n\t"
|
|
406
|
+
"fmv.x.h %[h], %[f]"
|
|
407
|
+
: [h] "=&r" (res)
|
|
408
|
+
: [f] "f" (f)
|
|
409
|
+
);
|
|
410
|
+
return res;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
414
|
+
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
415
|
+
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
|
416
|
+
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
|
417
|
+
|
|
384
418
|
#else
|
|
385
419
|
|
|
386
420
|
// FP16 <-> FP32
|
|
@@ -456,7 +490,7 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
|
|
|
456
490
|
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
457
491
|
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
458
492
|
|
|
459
|
-
#endif // defined(__ARM_NEON) && (!defined(
|
|
493
|
+
#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
|
|
460
494
|
|
|
461
495
|
// precomputed f32 table for f16 (256 KB)
|
|
462
496
|
// defined in ggml.c, initialized in ggml_init()
|
|
@@ -1,6 +1,70 @@
|
|
|
1
1
|
#ifndef GGML_METAL_IMPL
|
|
2
2
|
#define GGML_METAL_IMPL
|
|
3
3
|
|
|
4
|
+
// kernel parameters for mat-vec threadgroups
|
|
5
|
+
//
|
|
6
|
+
// N_R0: number of src0 rows to process per simdgroup
|
|
7
|
+
// N_SG: number of simdgroups per threadgroup
|
|
8
|
+
//
|
|
9
|
+
// TODO: for optimal performance, become function of the device and work size
|
|
10
|
+
|
|
11
|
+
#define N_R0_Q4_0 4
|
|
12
|
+
#define N_SG_Q4_0 2
|
|
13
|
+
|
|
14
|
+
#define N_R0_Q4_1 4
|
|
15
|
+
#define N_SG_Q4_1 2
|
|
16
|
+
|
|
17
|
+
#define N_R0_Q5_0 4
|
|
18
|
+
#define N_SG_Q5_0 2
|
|
19
|
+
|
|
20
|
+
#define N_R0_Q5_1 4
|
|
21
|
+
#define N_SG_Q5_1 2
|
|
22
|
+
|
|
23
|
+
#define N_R0_Q8_0 4
|
|
24
|
+
#define N_SG_Q8_0 2
|
|
25
|
+
|
|
26
|
+
#define N_R0_Q2_K 4
|
|
27
|
+
#define N_SG_Q2_K 2
|
|
28
|
+
|
|
29
|
+
#define N_R0_Q3_K 2
|
|
30
|
+
#define N_SG_Q3_K 2
|
|
31
|
+
|
|
32
|
+
#define N_R0_Q4_K 4
|
|
33
|
+
#define N_SG_Q4_K 2
|
|
34
|
+
|
|
35
|
+
#define N_R0_Q5_K 2
|
|
36
|
+
#define N_SG_Q5_K 2
|
|
37
|
+
|
|
38
|
+
#define N_R0_Q6_K 1
|
|
39
|
+
#define N_SG_Q6_K 2
|
|
40
|
+
|
|
41
|
+
#define N_R0_IQ1_S 4
|
|
42
|
+
#define N_SG_IQ1_S 2
|
|
43
|
+
|
|
44
|
+
#define N_R0_IQ1_M 4
|
|
45
|
+
#define N_SG_IQ1_M 2
|
|
46
|
+
|
|
47
|
+
#define N_R0_IQ2_XXS 4
|
|
48
|
+
#define N_SG_IQ2_XXS 2
|
|
49
|
+
|
|
50
|
+
#define N_R0_IQ2_XS 4
|
|
51
|
+
#define N_SG_IQ2_XS 2
|
|
52
|
+
|
|
53
|
+
#define N_R0_IQ2_S 4
|
|
54
|
+
#define N_SG_IQ2_S 2
|
|
55
|
+
|
|
56
|
+
#define N_R0_IQ3_XXS 4
|
|
57
|
+
#define N_SG_IQ3_XXS 2
|
|
58
|
+
|
|
59
|
+
#define N_R0_IQ3_S 4
|
|
60
|
+
#define N_SG_IQ3_S 2
|
|
61
|
+
|
|
62
|
+
#define N_R0_IQ4_NL 2
|
|
63
|
+
#define N_SG_IQ4_NL 2
|
|
64
|
+
|
|
65
|
+
#define N_R0_IQ4_XS 2
|
|
66
|
+
#define N_SG_IQ4_XS 2
|
|
67
|
+
|
|
4
68
|
// kernel argument structs
|
|
5
69
|
//
|
|
6
70
|
// - element counters (e.g. ne00) typically use int32_t to reduce register usage
|
|
@@ -143,6 +207,10 @@ typedef struct {
|
|
|
143
207
|
float attn_factor;
|
|
144
208
|
float beta_fast;
|
|
145
209
|
float beta_slow;
|
|
210
|
+
int32_t sect_0;
|
|
211
|
+
int32_t sect_1;
|
|
212
|
+
int32_t sect_2;
|
|
213
|
+
int32_t sect_3;
|
|
146
214
|
} ggml_metal_kargs_rope;
|
|
147
215
|
|
|
148
216
|
typedef struct {
|
|
@@ -155,9 +223,12 @@ typedef struct {
|
|
|
155
223
|
int32_t ne11;
|
|
156
224
|
int32_t ne_12_2; // assume K and V are same shape
|
|
157
225
|
int32_t ne_12_3;
|
|
158
|
-
uint64_t
|
|
159
|
-
uint64_t
|
|
160
|
-
uint64_t
|
|
226
|
+
uint64_t nb11;
|
|
227
|
+
uint64_t nb12;
|
|
228
|
+
uint64_t nb13;
|
|
229
|
+
uint64_t nb21;
|
|
230
|
+
uint64_t nb22;
|
|
231
|
+
uint64_t nb23;
|
|
161
232
|
uint64_t nb31;
|
|
162
233
|
int32_t ne1;
|
|
163
234
|
int32_t ne2;
|
|
@@ -232,21 +303,42 @@ typedef struct {
|
|
|
232
303
|
} ggml_metal_kargs_mul_mv_ext;
|
|
233
304
|
|
|
234
305
|
typedef struct {
|
|
235
|
-
int32_t
|
|
236
|
-
int32_t
|
|
237
|
-
uint64_t
|
|
306
|
+
int32_t ne10;
|
|
307
|
+
int32_t ne11; // n_expert_used (bcast)
|
|
308
|
+
uint64_t nb11;
|
|
309
|
+
uint64_t nb12;
|
|
310
|
+
int32_t neh11; // n_tokens
|
|
311
|
+
uint64_t nbh11;
|
|
312
|
+
int32_t ne20; // n_expert_used
|
|
313
|
+
uint64_t nb21;
|
|
314
|
+
} ggml_metal_kargs_mul_mm_id_map0;
|
|
315
|
+
|
|
316
|
+
typedef struct {
|
|
317
|
+
int32_t ne20; // n_expert_used
|
|
318
|
+
int32_t neh0;
|
|
319
|
+
int32_t neh1;
|
|
320
|
+
uint64_t nbh1;
|
|
321
|
+
uint64_t nbh2;
|
|
322
|
+
int32_t ne0;
|
|
323
|
+
uint64_t nb1;
|
|
324
|
+
uint64_t nb2;
|
|
325
|
+
} ggml_metal_kargs_mul_mm_id_map1;
|
|
326
|
+
|
|
327
|
+
typedef struct {
|
|
238
328
|
int32_t ne00;
|
|
239
329
|
int32_t ne02;
|
|
240
330
|
uint64_t nb01;
|
|
241
331
|
uint64_t nb02;
|
|
242
|
-
|
|
243
|
-
int32_t
|
|
244
|
-
|
|
245
|
-
uint64_t
|
|
246
|
-
uint64_t
|
|
247
|
-
uint64_t
|
|
248
|
-
int32_t
|
|
249
|
-
int32_t
|
|
332
|
+
uint64_t nb03;
|
|
333
|
+
int32_t neh12;
|
|
334
|
+
uint64_t nbh10;
|
|
335
|
+
uint64_t nbh11;
|
|
336
|
+
uint64_t nbh12;
|
|
337
|
+
uint64_t nbh13;
|
|
338
|
+
int32_t neh0;
|
|
339
|
+
int32_t neh1;
|
|
340
|
+
int16_t r2;
|
|
341
|
+
int16_t r3;
|
|
250
342
|
} ggml_metal_kargs_mul_mm_id;
|
|
251
343
|
|
|
252
344
|
typedef struct {
|
|
@@ -25,124 +25,72 @@ endif ()
|
|
|
25
25
|
if (GGML_OPENCL_EMBED_KERNELS)
|
|
26
26
|
add_compile_definitions(GGML_OPENCL_EMBED_KERNELS)
|
|
27
27
|
|
|
28
|
-
set(
|
|
29
|
-
|
|
30
|
-
set(OPENCL_CVT_CL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h")
|
|
28
|
+
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
|
|
29
|
+
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
|
31
30
|
|
|
32
|
-
|
|
33
|
-
set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h")
|
|
34
|
-
set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h")
|
|
35
|
-
set(OPENCL_TRANSPOSE_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h")
|
|
36
|
-
set(OPENCL_TRANSPOSE_32_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h")
|
|
37
|
-
set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h")
|
|
38
|
-
|
|
39
|
-
set(EMBED_KERNEL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py")
|
|
40
|
-
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
|
|
41
|
-
|
|
42
|
-
include_directories("${CMAKE_BINARY_DIR}/autogenerated")
|
|
43
|
-
|
|
44
|
-
# Python must be accessible from command line
|
|
45
|
-
add_custom_command(
|
|
46
|
-
OUTPUT ${OPENCL_CL_SOURCE_EMBED}
|
|
47
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
48
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl
|
|
49
|
-
${OPENCL_CL_SOURCE_EMBED}
|
|
50
|
-
DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT}
|
|
51
|
-
COMMENT "Generate ggml-opencl.cl.h"
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
add_custom_command(
|
|
55
|
-
OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED}
|
|
56
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
57
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl
|
|
58
|
-
${OPENCL_MM_CL_SOURCE_EMBED}
|
|
59
|
-
DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT}
|
|
60
|
-
COMMENT "Generate ggml-opencl_mm.cl.h"
|
|
61
|
-
)
|
|
62
|
-
|
|
63
|
-
add_custom_command(
|
|
64
|
-
OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
65
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl
|
|
67
|
-
${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
68
|
-
DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT}
|
|
69
|
-
COMMENT "Generate ggml-opencl_cvt.cl.h"
|
|
70
|
-
)
|
|
71
|
-
|
|
72
|
-
add_custom_command(
|
|
73
|
-
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
74
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
75
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl
|
|
76
|
-
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
77
|
-
DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT}
|
|
78
|
-
COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h"
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
add_custom_command(
|
|
82
|
-
OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
83
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
84
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl
|
|
85
|
-
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
86
|
-
DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT}
|
|
87
|
-
COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h"
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
add_custom_command(
|
|
91
|
-
OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
92
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
93
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
|
|
94
|
-
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
95
|
-
DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT}
|
|
96
|
-
COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h"
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
add_custom_command(
|
|
100
|
-
OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
101
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
102
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl
|
|
103
|
-
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
104
|
-
DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT}
|
|
105
|
-
COMMENT "Generate ggml-opencl_transpose_16.cl.h"
|
|
106
|
-
)
|
|
107
|
-
|
|
108
|
-
add_custom_command(
|
|
109
|
-
OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
110
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
111
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl
|
|
112
|
-
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
113
|
-
DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT}
|
|
114
|
-
COMMENT "Generate ggml-opencl_transpose_32.cl.h"
|
|
115
|
-
)
|
|
116
|
-
|
|
117
|
-
add_custom_command(
|
|
118
|
-
OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
|
119
|
-
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT}
|
|
120
|
-
${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl
|
|
121
|
-
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}
|
|
122
|
-
DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT}
|
|
123
|
-
COMMENT "Generate ggml-opencl_transpose_32_16.cl.h"
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
target_sources(${TARGET_NAME} PRIVATE
|
|
127
|
-
${OPENCL_CL_SOURCE_EMBED}
|
|
128
|
-
${OPENCL_MM_CL_SOURCE_EMBED}
|
|
129
|
-
${OPENCL_CVT_CL_SOURCE_EMBED}
|
|
130
|
-
${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED}
|
|
131
|
-
${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED}
|
|
132
|
-
${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED}
|
|
133
|
-
${OPENCL_TRANSPOSE_16_SOURCE_EMBED}
|
|
134
|
-
${OPENCL_TRANSPOSE_32_SOURCE_EMBED}
|
|
135
|
-
${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED})
|
|
136
|
-
else ()
|
|
137
|
-
# copy ggml-opencl.cl to bin directory
|
|
138
|
-
configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY)
|
|
139
|
-
configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY)
|
|
140
|
-
configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY)
|
|
141
|
-
|
|
142
|
-
configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY)
|
|
143
|
-
configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY)
|
|
144
|
-
configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY)
|
|
145
|
-
configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY)
|
|
146
|
-
configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY)
|
|
147
|
-
configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY)
|
|
31
|
+
target_include_directories(${TARGET_NAME} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/autogenerated")
|
|
148
32
|
endif ()
|
|
33
|
+
|
|
34
|
+
function(ggml_opencl_add_kernel KNAME)
|
|
35
|
+
set(KERN_HDR ${CMAKE_CURRENT_BINARY_DIR}/autogenerated/${KNAME}.cl.h)
|
|
36
|
+
set(KERN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels/${KNAME}.cl)
|
|
37
|
+
|
|
38
|
+
if (GGML_OPENCL_EMBED_KERNELS)
|
|
39
|
+
message(STATUS "opencl: embedding kernel ${KNAME}")
|
|
40
|
+
|
|
41
|
+
# Python must be accessible from command line
|
|
42
|
+
add_custom_command(
|
|
43
|
+
OUTPUT ${KERN_HDR}
|
|
44
|
+
COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} ${KERN_SRC} ${KERN_HDR}
|
|
45
|
+
DEPENDS ${KERN_SRC} ${EMBED_KERNEL_SCRIPT}
|
|
46
|
+
COMMENT "Generate ${KERN_HDR}"
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
target_sources(${TARGET_NAME} PRIVATE ${KERN_HDR})
|
|
50
|
+
else ()
|
|
51
|
+
message(STATUS "opencl: adding kernel ${KNAME}")
|
|
52
|
+
configure_file(${KERN_SRC} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${KNAME}.cl COPYONLY)
|
|
53
|
+
endif ()
|
|
54
|
+
endfunction()
|
|
55
|
+
|
|
56
|
+
set(GGML_OPENCL_KERNELS
|
|
57
|
+
add
|
|
58
|
+
clamp
|
|
59
|
+
cpy
|
|
60
|
+
cvt
|
|
61
|
+
diag_mask_inf
|
|
62
|
+
gelu
|
|
63
|
+
gemv_noshuffle_general
|
|
64
|
+
gemv_noshuffle
|
|
65
|
+
get_rows
|
|
66
|
+
im2col_f32
|
|
67
|
+
im2col_f16
|
|
68
|
+
mul_mat_Ab_Bi_8x4
|
|
69
|
+
mul_mv_f16_f16
|
|
70
|
+
mul_mv_f16_f32_1row
|
|
71
|
+
mul_mv_f16_f32_l4
|
|
72
|
+
mul_mv_f16_f32
|
|
73
|
+
mul_mv_f32_f32
|
|
74
|
+
mul_mv_q4_0_f32
|
|
75
|
+
mul_mv_q4_0_f32_v
|
|
76
|
+
mul_mv_q4_0_f32_8x_flat
|
|
77
|
+
mul_mv_q4_0_f32_1d_8x_flat
|
|
78
|
+
mul_mv_q4_0_f32_1d_16x_flat
|
|
79
|
+
mul_mv_q6_k
|
|
80
|
+
mul
|
|
81
|
+
norm
|
|
82
|
+
relu
|
|
83
|
+
rms_norm
|
|
84
|
+
rope
|
|
85
|
+
scale
|
|
86
|
+
silu
|
|
87
|
+
softmax_4_f32
|
|
88
|
+
softmax_4_f16
|
|
89
|
+
softmax_f32
|
|
90
|
+
softmax_f16
|
|
91
|
+
transpose
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
foreach (K ${GGML_OPENCL_KERNELS})
|
|
95
|
+
ggml_opencl_add_kernel(${K})
|
|
96
|
+
endforeach()
|