@fugood/llama.node 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +1 -8
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/package.json +4 -2
- package/src/DetokenizeWorker.cpp +1 -1
- package/src/EmbeddingWorker.cpp +2 -2
- package/src/LlamaCompletionWorker.cpp +10 -10
- package/src/LlamaCompletionWorker.h +2 -2
- package/src/LlamaContext.cpp +14 -17
- package/src/TokenizeWorker.cpp +1 -1
- package/src/common.hpp +5 -4
- package/src/llama.cpp/.github/workflows/build.yml +137 -29
- package/src/llama.cpp/.github/workflows/close-issue.yml +5 -0
- package/src/llama.cpp/.github/workflows/docker.yml +46 -34
- package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +7 -0
- package/src/llama.cpp/.github/workflows/nix-ci.yml +7 -0
- package/src/llama.cpp/.github/workflows/python-check-requirements.yml +2 -4
- package/src/llama.cpp/.github/workflows/python-type-check.yml +3 -1
- package/src/llama.cpp/.github/workflows/server.yml +7 -0
- package/src/llama.cpp/CMakeLists.txt +26 -11
- package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
- package/src/llama.cpp/common/CMakeLists.txt +10 -10
- package/src/llama.cpp/common/arg.cpp +2041 -0
- package/src/llama.cpp/common/arg.h +77 -0
- package/src/llama.cpp/common/common.cpp +523 -1861
- package/src/llama.cpp/common/common.h +234 -106
- package/src/llama.cpp/common/console.cpp +3 -0
- package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
- package/src/llama.cpp/common/log.cpp +401 -0
- package/src/llama.cpp/common/log.h +66 -698
- package/src/llama.cpp/common/ngram-cache.cpp +39 -36
- package/src/llama.cpp/common/ngram-cache.h +19 -19
- package/src/llama.cpp/common/sampling.cpp +356 -350
- package/src/llama.cpp/common/sampling.h +62 -139
- package/src/llama.cpp/common/stb_image.h +5990 -6398
- package/src/llama.cpp/docs/build.md +72 -17
- package/src/llama.cpp/examples/CMakeLists.txt +1 -2
- package/src/llama.cpp/examples/batched/batched.cpp +49 -65
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +42 -53
- package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +55 -52
- package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +22 -22
- package/src/llama.cpp/examples/cvector-generator/pca.hpp +3 -13
- package/src/llama.cpp/examples/embedding/embedding.cpp +147 -91
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +37 -37
- package/src/llama.cpp/examples/export-lora/export-lora.cpp +39 -38
- package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +14 -39
- package/src/llama.cpp/examples/{baby-llama → gen-docs}/CMakeLists.txt +2 -2
- package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +83 -0
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +58 -39
- package/src/llama.cpp/examples/gritlm/gritlm.cpp +46 -39
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +75 -69
- package/src/llama.cpp/examples/infill/infill.cpp +131 -192
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +276 -178
- package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +1 -0
- package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +40 -36
- package/src/llama.cpp/examples/llava/CMakeLists.txt +7 -0
- package/src/llama.cpp/examples/llava/clip.cpp +686 -150
- package/src/llama.cpp/examples/llava/clip.h +11 -2
- package/src/llama.cpp/examples/llava/llava-cli.cpp +60 -71
- package/src/llama.cpp/examples/llava/llava.cpp +146 -26
- package/src/llama.cpp/examples/llava/llava.h +2 -3
- package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +323 -0
- package/src/llama.cpp/examples/llava/requirements.txt +1 -0
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +55 -56
- package/src/llama.cpp/examples/lookup/lookup-create.cpp +15 -13
- package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +34 -33
- package/src/llama.cpp/examples/lookup/lookup.cpp +60 -63
- package/src/llama.cpp/examples/main/main.cpp +216 -313
- package/src/llama.cpp/examples/parallel/parallel.cpp +58 -59
- package/src/llama.cpp/examples/passkey/passkey.cpp +53 -61
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +277 -311
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +27 -9
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +12 -12
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +57 -52
- package/src/llama.cpp/examples/rpc/rpc-server.cpp +27 -2
- package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +60 -46
- package/src/llama.cpp/examples/server/CMakeLists.txt +7 -18
- package/src/llama.cpp/examples/server/server.cpp +1347 -1531
- package/src/llama.cpp/examples/server/tests/requirements.txt +2 -1
- package/src/llama.cpp/examples/server/utils.hpp +396 -107
- package/src/llama.cpp/examples/simple/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/simple/simple.cpp +132 -106
- package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +197 -0
- package/src/llama.cpp/examples/speculative/speculative.cpp +153 -124
- package/src/llama.cpp/examples/sycl/run-llama2.sh +10 -19
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/tokenize/tokenize.cpp +27 -29
- package/src/llama.cpp/ggml/CMakeLists.txt +29 -12
- package/src/llama.cpp/ggml/include/ggml-alloc.h +3 -3
- package/src/llama.cpp/ggml/include/ggml-amx.h +25 -0
- package/src/llama.cpp/ggml/include/ggml-backend.h +166 -68
- package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
- package/src/llama.cpp/ggml/include/ggml-cann.h +17 -19
- package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
- package/src/llama.cpp/ggml/include/ggml-cpu.h +177 -0
- package/src/llama.cpp/ggml/include/ggml-cuda.h +17 -17
- package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
- package/src/llama.cpp/ggml/include/ggml-metal.h +13 -12
- package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
- package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
- package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
- package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
- package/src/llama.cpp/ggml/include/ggml.h +272 -505
- package/src/llama.cpp/ggml/src/CMakeLists.txt +69 -1110
- package/src/llama.cpp/ggml/src/ggml-aarch64.c +52 -2116
- package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -20
- package/src/llama.cpp/ggml/src/ggml-alloc.c +29 -27
- package/src/llama.cpp/ggml/src/ggml-amx/CMakeLists.txt +107 -0
- package/src/llama.cpp/ggml/src/ggml-amx/common.h +94 -0
- package/src/llama.cpp/ggml/src/ggml-amx/ggml-amx.cpp +446 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.cpp +2510 -0
- package/src/llama.cpp/ggml/src/ggml-amx/mmq.h +17 -0
- package/src/llama.cpp/ggml/src/ggml-backend-impl.h +144 -81
- package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +195 -0
- package/src/llama.cpp/ggml/src/{ggml-backend.c → ggml-backend.cpp} +394 -635
- package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +91 -0
- package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +217 -70
- package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +46 -0
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +4 -27
- package/src/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +32 -4
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +179 -41
- package/src/llama.cpp/ggml/src/ggml-cann/common.h +1 -0
- package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +458 -353
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -1
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/ascendc_kernels.h +2 -0
- package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +278 -0
- package/src/llama.cpp/ggml/src/ggml-common.h +20 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +261 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.c +3560 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +30 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +371 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10822 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +13970 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +663 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +1885 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +155 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/cuda.h +14 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +178 -0
- package/src/llama.cpp/ggml/src/ggml-cuda/vendors/musa.h +134 -0
- package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +106 -0
- package/src/llama.cpp/ggml/src/ggml-impl.h +380 -584
- package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +162 -0
- package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +233 -87
- package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +108 -0
- package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +249 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +100 -0
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +867 -0
- package/src/llama.cpp/ggml/src/ggml-quants.c +369 -9994
- package/src/llama.cpp/ggml/src/ggml-quants.h +78 -110
- package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +11 -0
- package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +560 -335
- package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +81 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +51 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +310 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +1 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.cpp +99 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/conv.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +57 -57
- package/src/llama.cpp/ggml/src/ggml-sycl/convert.hpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +106 -106
- package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +18 -25
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1011 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +101 -0
- package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3350 -3980
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +125 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/im2col.hpp +23 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +70 -68
- package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +9 -6
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +8 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +1 -1
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +71 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.hpp +21 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +138 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
- package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
- package/src/llama.cpp/ggml/src/ggml-threading.h +12 -0
- package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +78 -0
- package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2034 -1718
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +2 -0
- package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp +152 -185
- package/src/llama.cpp/ggml/src/ggml.c +2075 -16579
- package/src/llama.cpp/include/llama.h +296 -285
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.inp +112 -0
- package/src/llama.cpp/models/ggml-vocab-chameleon.gguf.out +46 -0
- package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
- package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
- package/src/llama.cpp/requirements/requirements-convert_legacy_llama.txt +1 -1
- package/src/llama.cpp/src/CMakeLists.txt +2 -1
- package/src/llama.cpp/src/llama-grammar.cpp +721 -122
- package/src/llama.cpp/src/llama-grammar.h +120 -15
- package/src/llama.cpp/src/llama-impl.h +156 -1
- package/src/llama.cpp/src/llama-sampling.cpp +2058 -346
- package/src/llama.cpp/src/llama-sampling.h +39 -47
- package/src/llama.cpp/src/llama-vocab.cpp +390 -127
- package/src/llama.cpp/src/llama-vocab.h +60 -20
- package/src/llama.cpp/src/llama.cpp +6215 -3263
- package/src/llama.cpp/src/unicode-data.cpp +6 -4
- package/src/llama.cpp/src/unicode-data.h +4 -4
- package/src/llama.cpp/src/unicode.cpp +15 -7
- package/src/llama.cpp/tests/CMakeLists.txt +4 -2
- package/src/llama.cpp/tests/test-arg-parser.cpp +131 -0
- package/src/llama.cpp/tests/test-backend-ops.cpp +1725 -297
- package/src/llama.cpp/tests/test-barrier.cpp +94 -0
- package/src/llama.cpp/tests/test-chat-template.cpp +9 -5
- package/src/llama.cpp/tests/test-grammar-integration.cpp +23 -38
- package/src/llama.cpp/tests/test-grammar-parser.cpp +6 -4
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +23 -8
- package/src/llama.cpp/tests/test-llama-grammar.cpp +9 -8
- package/src/llama.cpp/tests/test-log.cpp +39 -0
- package/src/llama.cpp/tests/test-opt.cpp +853 -142
- package/src/llama.cpp/tests/test-quantize-fns.cpp +28 -19
- package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
- package/src/llama.cpp/tests/test-rope.cpp +2 -1
- package/src/llama.cpp/tests/test-sampling.cpp +226 -142
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +56 -36
- package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
- package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
- package/patches/llama.patch +0 -22
- package/src/llama.cpp/.github/workflows/bench.yml +0 -310
- package/src/llama.cpp/common/grammar-parser.cpp +0 -536
- package/src/llama.cpp/common/grammar-parser.h +0 -29
- package/src/llama.cpp/common/train.cpp +0 -1513
- package/src/llama.cpp/common/train.h +0 -233
- package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1640
- package/src/llama.cpp/examples/benchmark/CMakeLists.txt +0 -6
- package/src/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -275
- package/src/llama.cpp/ggml/src/llamafile/sgemm.cpp +0 -1027
- package/src/llama.cpp/tests/test-grad0.cpp +0 -1566
- /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
- /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
find_package (Threads REQUIRED)
|
|
1
2
|
|
|
2
3
|
set(TARGET vulkan-shaders-gen)
|
|
3
4
|
add_executable(${TARGET} vulkan-shaders-gen.cpp)
|
|
4
5
|
install(TARGETS ${TARGET} RUNTIME)
|
|
5
6
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
|
7
|
+
target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
|
package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/vulkan-shaders-gen.cpp
RENAMED
|
@@ -16,12 +16,14 @@
|
|
|
16
16
|
#include <cstdio>
|
|
17
17
|
#include <cstring>
|
|
18
18
|
#include <cstdlib>
|
|
19
|
+
#include <cassert>
|
|
19
20
|
#include <sys/stat.h>
|
|
20
21
|
#include <sys/types.h>
|
|
21
22
|
|
|
22
23
|
#ifdef _WIN32
|
|
23
24
|
#include <windows.h>
|
|
24
25
|
#include <direct.h> // For _mkdir on Windows
|
|
26
|
+
#include <algorithm> // For std::replace on w64devkit
|
|
25
27
|
#else
|
|
26
28
|
#include <unistd.h>
|
|
27
29
|
#include <sys/wait.h>
|
|
@@ -30,20 +32,6 @@
|
|
|
30
32
|
|
|
31
33
|
#define ASYNCIO_CONCURRENCY 64
|
|
32
34
|
|
|
33
|
-
// define prototypes
|
|
34
|
-
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
|
|
35
|
-
bool directory_exists(const std::string& path);
|
|
36
|
-
bool create_directory(const std::string& path);
|
|
37
|
-
std::string to_uppercase(const std::string& input);
|
|
38
|
-
bool string_ends_with(const std::string& str, const std::string& suffix);
|
|
39
|
-
std::string join_paths(const std::string& path1, const std::string& path2);
|
|
40
|
-
std::string basename(const std::string &path);
|
|
41
|
-
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
|
|
42
|
-
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
|
|
43
|
-
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
|
|
44
|
-
void process_shaders(std::vector<std::future<void>>& tasks);
|
|
45
|
-
void write_output_files();
|
|
46
|
-
|
|
47
35
|
std::mutex lock;
|
|
48
36
|
std::vector<std::pair<std::string, std::string>> shader_fnames;
|
|
49
37
|
|
|
@@ -52,7 +40,7 @@ std::string input_dir = "vulkan-shaders";
|
|
|
52
40
|
std::string output_dir = "/tmp";
|
|
53
41
|
std::string target_hpp = "ggml-vulkan-shaders.hpp";
|
|
54
42
|
std::string target_cpp = "ggml-vulkan-shaders.cpp";
|
|
55
|
-
bool
|
|
43
|
+
bool no_clean = false;
|
|
56
44
|
|
|
57
45
|
const std::vector<std::string> type_names = {
|
|
58
46
|
"f32",
|
|
@@ -105,11 +93,11 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s
|
|
|
105
93
|
std::array<char, 128> buffer;
|
|
106
94
|
DWORD bytes_read;
|
|
107
95
|
|
|
108
|
-
while (ReadFile(stdout_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
|
96
|
+
while (ReadFile(stdout_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
|
109
97
|
stdout_str.append(buffer.data(), bytes_read);
|
|
110
98
|
}
|
|
111
99
|
|
|
112
|
-
while (ReadFile(stderr_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
|
100
|
+
while (ReadFile(stderr_read, buffer.data(), (DWORD)buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
|
|
113
101
|
stderr_str.append(buffer.data(), bytes_read);
|
|
114
102
|
}
|
|
115
103
|
|
|
@@ -193,11 +181,7 @@ bool string_ends_with(const std::string& str, const std::string& suffix) {
|
|
|
193
181
|
return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
|
|
194
182
|
}
|
|
195
183
|
|
|
196
|
-
|
|
197
|
-
static const char path_separator = '\\';
|
|
198
|
-
#else
|
|
199
|
-
static const char path_separator = '/';
|
|
200
|
-
#endif
|
|
184
|
+
static const char path_separator = '/';
|
|
201
185
|
|
|
202
186
|
std::string join_paths(const std::string& path1, const std::string& path2) {
|
|
203
187
|
return path1 + path_separator + path2;
|
|
@@ -207,12 +191,26 @@ std::string basename(const std::string &path) {
|
|
|
207
191
|
return path.substr(path.find_last_of("/\\") + 1);
|
|
208
192
|
}
|
|
209
193
|
|
|
210
|
-
|
|
194
|
+
// variables to track number of compiles in progress
|
|
195
|
+
static uint32_t compile_count = 0;
|
|
196
|
+
static std::mutex compile_count_mutex;
|
|
197
|
+
static std::condition_variable compile_count_cond;
|
|
198
|
+
|
|
199
|
+
void string_to_spv_func(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
|
211
200
|
std::string name = _name + (fp16 ? "" : "_fp32");
|
|
212
201
|
std::string out_fname = join_paths(output_dir, name + ".spv");
|
|
213
202
|
std::string in_path = join_paths(input_dir, in_fname);
|
|
214
203
|
|
|
215
|
-
|
|
204
|
+
#ifdef _WIN32
|
|
205
|
+
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
|
|
206
|
+
#else
|
|
207
|
+
std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname};
|
|
208
|
+
#endif
|
|
209
|
+
|
|
210
|
+
#ifdef GGML_VULKAN_SHADER_DEBUG_INFO
|
|
211
|
+
cmd.push_back("-g");
|
|
212
|
+
#endif
|
|
213
|
+
|
|
216
214
|
for (const auto& define : defines) {
|
|
217
215
|
cmd.push_back("-D" + define.first + "=" + define.second);
|
|
218
216
|
}
|
|
@@ -241,6 +239,12 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const
|
|
|
241
239
|
} catch (const std::exception& e) {
|
|
242
240
|
std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
|
|
243
241
|
}
|
|
242
|
+
{
|
|
243
|
+
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
|
244
|
+
assert(compile_count > 0);
|
|
245
|
+
compile_count--;
|
|
246
|
+
}
|
|
247
|
+
compile_count_cond.notify_all();
|
|
244
248
|
}
|
|
245
249
|
|
|
246
250
|
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
|
|
@@ -249,7 +253,22 @@ std::map<std::string, std::string> merge_maps(const std::map<std::string, std::s
|
|
|
249
253
|
return result;
|
|
250
254
|
}
|
|
251
255
|
|
|
252
|
-
|
|
256
|
+
static std::vector<std::future<void>> compiles;
|
|
257
|
+
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
|
|
258
|
+
{
|
|
259
|
+
// wait until fewer than N compiles are in progress.
|
|
260
|
+
// 16 is an arbitrary limit, the goal is to avoid "failed to create pipe" errors.
|
|
261
|
+
uint32_t N = 16;
|
|
262
|
+
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
|
263
|
+
while (compile_count >= N) {
|
|
264
|
+
compile_count_cond.wait(guard);
|
|
265
|
+
}
|
|
266
|
+
compile_count++;
|
|
267
|
+
}
|
|
268
|
+
compiles.push_back(std::async(string_to_spv_func, _name, in_fname, defines, fp16));
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
void matmul_shaders(bool fp16, bool matmul_id) {
|
|
253
272
|
std::string load_vec = fp16 ? "8" : "4";
|
|
254
273
|
std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
|
|
255
274
|
std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
|
|
@@ -267,39 +286,30 @@ void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmu
|
|
|
267
286
|
}
|
|
268
287
|
|
|
269
288
|
// Shaders with f16 B_TYPE
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
}));
|
|
276
|
-
|
|
277
|
-
tasks.push_back(std::async(std::launch::async, [=] {
|
|
278
|
-
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
|
279
|
-
}));
|
|
280
|
-
tasks.push_back(std::async(std::launch::async, [=] {
|
|
281
|
-
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
|
282
|
-
}));
|
|
289
|
+
string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
|
290
|
+
string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
|
291
|
+
|
|
292
|
+
string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
|
|
293
|
+
string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
|
|
283
294
|
|
|
284
295
|
for (const auto& tname : type_names) {
|
|
285
296
|
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
|
297
|
+
// For unaligned, load one at a time for f32/f16, or two at a time for quants
|
|
298
|
+
std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
|
|
299
|
+
// For aligned matmul loads
|
|
286
300
|
std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
}));
|
|
290
|
-
tasks.push_back(std::async(std::launch::async, [=] {
|
|
291
|
-
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
|
292
|
-
}));
|
|
301
|
+
string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
|
|
302
|
+
string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
|
|
293
303
|
}
|
|
294
304
|
}
|
|
295
305
|
|
|
296
|
-
void process_shaders(
|
|
306
|
+
void process_shaders() {
|
|
297
307
|
std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
|
|
298
308
|
std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
|
|
299
309
|
|
|
300
310
|
for (const auto& fp16 : {false, true}) {
|
|
301
|
-
matmul_shaders(
|
|
302
|
-
matmul_shaders(
|
|
311
|
+
matmul_shaders(fp16, false);
|
|
312
|
+
matmul_shaders(fp16, true);
|
|
303
313
|
}
|
|
304
314
|
|
|
305
315
|
for (const auto& tname : type_names) {
|
|
@@ -307,137 +317,106 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
|
|
307
317
|
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
|
308
318
|
std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
|
|
309
319
|
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
}));
|
|
313
|
-
tasks.push_back(std::async(std::launch::async, [=] {
|
|
314
|
-
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
|
315
|
-
}));
|
|
320
|
+
string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
|
321
|
+
string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}}));
|
|
316
322
|
|
|
317
|
-
|
|
318
|
-
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
319
|
-
}));
|
|
323
|
+
string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"D_TYPE", "float"}}));
|
|
320
324
|
|
|
321
325
|
// Dequant shaders
|
|
322
326
|
if (tname != "f16") {
|
|
323
|
-
|
|
324
|
-
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
|
325
|
-
}));
|
|
327
|
+
string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
|
|
326
328
|
}
|
|
327
329
|
|
|
328
330
|
if (!string_ends_with(tname, "_k")) {
|
|
329
331
|
shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
|
|
330
332
|
|
|
331
333
|
if (tname == "f16") {
|
|
332
|
-
|
|
333
|
-
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
|
334
|
-
}));
|
|
334
|
+
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
|
335
335
|
} else {
|
|
336
|
-
|
|
337
|
-
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
|
338
|
-
}));
|
|
336
|
+
string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
|
|
339
337
|
}
|
|
340
|
-
|
|
341
|
-
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
|
342
|
-
}));
|
|
338
|
+
string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
|
|
343
339
|
}
|
|
344
340
|
}
|
|
345
341
|
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
}));
|
|
349
|
-
tasks.push_back(std::async(std::launch::async, [] {
|
|
350
|
-
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
351
|
-
}));
|
|
342
|
+
string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
343
|
+
string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
352
344
|
|
|
353
345
|
// Norms
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
}));
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
})
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
})
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
})
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
})
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
})
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
})
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
}));
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
429
|
-
}));
|
|
430
|
-
tasks.push_back(std::async(std::launch::async, [] {
|
|
431
|
-
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
|
432
|
-
}));
|
|
433
|
-
|
|
434
|
-
tasks.push_back(std::async(std::launch::async, [] {
|
|
435
|
-
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
|
436
|
-
}));
|
|
437
|
-
|
|
438
|
-
tasks.push_back(std::async(std::launch::async, [=] {
|
|
439
|
-
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
440
|
-
}));
|
|
346
|
+
string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
347
|
+
string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
348
|
+
string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
349
|
+
|
|
350
|
+
string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
351
|
+
string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
|
352
|
+
string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
|
353
|
+
string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
354
|
+
string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
|
|
355
|
+
string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
|
356
|
+
|
|
357
|
+
string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
358
|
+
string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
|
|
359
|
+
|
|
360
|
+
string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
361
|
+
|
|
362
|
+
string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
|
|
363
|
+
|
|
364
|
+
string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
365
|
+
|
|
366
|
+
string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
367
|
+
|
|
368
|
+
string_to_spv("repeat_f32", "repeat.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
369
|
+
|
|
370
|
+
string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
371
|
+
|
|
372
|
+
string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
373
|
+
|
|
374
|
+
string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
375
|
+
|
|
376
|
+
string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
377
|
+
|
|
378
|
+
string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
|
|
379
|
+
|
|
380
|
+
string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
381
|
+
|
|
382
|
+
string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
383
|
+
string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
|
|
384
|
+
string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
|
|
385
|
+
|
|
386
|
+
string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
387
|
+
|
|
388
|
+
string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
389
|
+
string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
390
|
+
string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
391
|
+
string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
392
|
+
string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
393
|
+
string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
394
|
+
|
|
395
|
+
string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
396
|
+
|
|
397
|
+
string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
398
|
+
string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
|
|
399
|
+
|
|
400
|
+
string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
401
|
+
string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
|
402
|
+
|
|
403
|
+
string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
|
|
404
|
+
string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
|
|
405
|
+
|
|
406
|
+
string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
|
|
407
|
+
|
|
408
|
+
string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
409
|
+
|
|
410
|
+
string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
411
|
+
string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
|
|
412
|
+
|
|
413
|
+
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
414
|
+
|
|
415
|
+
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
|
416
|
+
|
|
417
|
+
for (auto &c : compiles) {
|
|
418
|
+
c.wait();
|
|
419
|
+
}
|
|
441
420
|
}
|
|
442
421
|
|
|
443
422
|
void write_output_files() {
|
|
@@ -449,10 +428,16 @@ void write_output_files() {
|
|
|
449
428
|
|
|
450
429
|
for (const auto& pair : shader_fnames) {
|
|
451
430
|
const std::string& name = pair.first;
|
|
452
|
-
|
|
431
|
+
#ifdef _WIN32
|
|
432
|
+
std::string path = pair.second;
|
|
433
|
+
std::replace(path.begin(), path.end(), '/', '\\' );
|
|
434
|
+
#else
|
|
435
|
+
const std::string& path = pair.second;
|
|
436
|
+
#endif
|
|
437
|
+
|
|
453
438
|
FILE* spv = fopen(path.c_str(), "rb");
|
|
454
439
|
if (!spv) {
|
|
455
|
-
std::cerr << "Error opening SPIR-V file: " << path << "\n";
|
|
440
|
+
std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
|
456
441
|
continue;
|
|
457
442
|
}
|
|
458
443
|
|
|
@@ -464,7 +449,7 @@ void write_output_files() {
|
|
|
464
449
|
size_t read_size = fread(data.data(), 1, size, spv);
|
|
465
450
|
fclose(spv);
|
|
466
451
|
if (read_size != size) {
|
|
467
|
-
std::cerr << "Error reading SPIR-V file: " << path << "\n";
|
|
452
|
+
std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
|
|
468
453
|
continue;
|
|
469
454
|
}
|
|
470
455
|
|
|
@@ -478,9 +463,8 @@ void write_output_files() {
|
|
|
478
463
|
}
|
|
479
464
|
fprintf(src, "\n};\n\n");
|
|
480
465
|
|
|
481
|
-
if (
|
|
466
|
+
if (!no_clean) {
|
|
482
467
|
std::remove(path.c_str());
|
|
483
|
-
// fprintf(stderr, "Removed: %s\n", path.c_str());
|
|
484
468
|
}
|
|
485
469
|
}
|
|
486
470
|
|
|
@@ -496,18 +480,6 @@ int main(int argc, char** argv) {
|
|
|
496
480
|
}
|
|
497
481
|
}
|
|
498
482
|
|
|
499
|
-
if (argc <= 1 || args.find("--help") != args.end()) {
|
|
500
|
-
std::cout << "Usage:\n"
|
|
501
|
-
"\tvulkan-shaders-gen [options]\n\n"
|
|
502
|
-
"Options:\n"
|
|
503
|
-
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
|
|
504
|
-
"\t--input-dir Directory containing shader sources (required)\n"
|
|
505
|
-
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
|
|
506
|
-
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
|
|
507
|
-
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
|
|
508
|
-
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
|
|
509
|
-
return EXIT_SUCCESS;
|
|
510
|
-
}
|
|
511
483
|
if (args.find("--glslc") != args.end()) {
|
|
512
484
|
GLSLC = args["--glslc"]; // Path to glslc
|
|
513
485
|
}
|
|
@@ -524,7 +496,7 @@ int main(int argc, char** argv) {
|
|
|
524
496
|
target_cpp = args["--target-cpp"]; // Path to generated cpp file
|
|
525
497
|
}
|
|
526
498
|
if (args.find("--no-clean") != args.end()) {
|
|
527
|
-
|
|
499
|
+
no_clean = true; // Keep temporary SPIR-V files in output-dir after build
|
|
528
500
|
}
|
|
529
501
|
|
|
530
502
|
if (!directory_exists(input_dir)) {
|
|
@@ -539,12 +511,7 @@ int main(int argc, char** argv) {
|
|
|
539
511
|
}
|
|
540
512
|
}
|
|
541
513
|
|
|
542
|
-
|
|
543
|
-
process_shaders(tasks);
|
|
544
|
-
|
|
545
|
-
for (auto& task : tasks) {
|
|
546
|
-
task.get();
|
|
547
|
-
}
|
|
514
|
+
process_shaders();
|
|
548
515
|
|
|
549
516
|
write_output_files();
|
|
550
517
|
|