@novastera-oss/llamarn 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -2
- package/android/CMakeLists.txt +6 -3
- package/android/src/main/cpp/include/llama.h +140 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +48 -67
- package/cpp/LlamaCppModel.h +8 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +33 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +15 -28
- package/cpp/llama.cpp/common/arg.cpp +38 -12
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +9 -3
- package/cpp/llama.cpp/common/chat-parser.h +4 -1
- package/cpp/llama.cpp/common/chat.cpp +16 -13
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +52 -40
- package/cpp/llama.cpp/common/common.h +5 -2
- package/cpp/llama.cpp/common/json-partial.cpp +5 -4
- package/cpp/llama.cpp/common/json-partial.h +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +2 -1
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +4 -4
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +128 -84
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml.h +1 -3
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +49 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +10 -5
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +93 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4113 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2174 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2638 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2731 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2068 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +396 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1299 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1480 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4310 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +59 -3206
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +7 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +33 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +56 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1157 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1555 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +2 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +6 -8
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +25 -16
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +33 -8
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +135 -100
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +908 -3
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +19 -24
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +21 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +121 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +2 -96
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +164 -46
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +32 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +118 -11
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +26 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +432 -248
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +0 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +9 -8
- package/cpp/llama.cpp/ggml/src/ggml.cpp +26 -0
- package/cpp/llama.cpp/ggml/src/gguf.cpp +19 -2
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +57 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +4 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +14 -3
- package/cpp/llama.cpp/include/llama.h +140 -38
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +4 -1
- package/cpp/llama.cpp/src/llama-arch.cpp +95 -3
- package/cpp/llama.cpp/src/llama-arch.h +7 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +289 -31
- package/cpp/llama.cpp/src/llama-batch.h +47 -17
- package/cpp/llama.cpp/src/llama-chat.cpp +19 -2
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +488 -313
- package/cpp/llama.cpp/src/llama-context.h +38 -17
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +275 -152
- package/cpp/llama.cpp/src/llama-graph.h +109 -52
- package/cpp/llama.cpp/src/llama-hparams.cpp +6 -2
- package/cpp/llama.cpp/src/llama-hparams.h +8 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +281 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +133 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +1835 -0
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +308 -0
- package/cpp/llama.cpp/src/llama-kv-cells.h +53 -17
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +247 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +143 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +1116 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +188 -0
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +89 -4
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model.cpp +735 -143
- package/cpp/llama.cpp/src/llama-model.h +4 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +2 -1
- package/cpp/llama.cpp/src/llama-vocab.cpp +39 -25
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +10518 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +93468 -0
- package/cpp/llama.cpp/{common → vendor}/minja/chat-template.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor}/minja/minja.hpp +1 -1
- package/cpp/llama.cpp/{common → vendor/nlohmann}/json.hpp +3027 -2267
- package/cpp/llama.cpp/vendor/nlohmann/json_fwd.hpp +187 -0
- package/cpp/llama.cpp/vendor/stb/stb_image.h +7988 -0
- package/cpp/rn-completion.cpp +65 -10
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +8 -1
- package/ios/include/chat.h +1 -1
- package/ios/include/common/minja/chat-template.hpp +1 -1
- package/ios/include/common/minja/minja.hpp +1 -1
- package/ios/include/common.h +5 -2
- package/ios/include/json-schema-to-grammar.h +4 -4
- package/ios/include/llama.h +140 -38
- package/ios/include/{common → nlohmann}/json.hpp +3027 -2267
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4617
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3557
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4638
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3744 -3559
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4863 -4616
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4834 -4637
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3742 -3556
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4900 -4653
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4871 -4674
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3773 -3587
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +1 -3
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +140 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -2747
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -502
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
|
|
78
78
|
#define VK_VENDOR_ID_INTEL 0x8086
|
|
79
79
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
|
80
80
|
|
|
81
|
-
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE
|
|
81
|
+
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
|
|
82
82
|
|
|
83
83
|
#define GGML_VK_MAX_NODES 8192
|
|
84
84
|
|
|
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
|
|
102
102
|
|
|
103
103
|
struct ggml_backend_vk_context;
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
uint32_t queue_family_index;
|
|
107
|
-
vk::Queue queue;
|
|
108
|
-
vk::CommandPool pool;
|
|
109
|
-
uint32_t cmd_buffer_idx;
|
|
110
|
-
std::vector<vk::CommandBuffer> cmd_buffers;
|
|
111
|
-
|
|
112
|
-
vk::PipelineStageFlags stage_flags;
|
|
113
|
-
|
|
114
|
-
bool transfer_only;
|
|
115
|
-
};
|
|
105
|
+
#define MAX_PARAMETER_COUNT 8
|
|
116
106
|
|
|
117
107
|
struct vk_pipeline_struct {
|
|
118
108
|
std::string name;
|
|
119
109
|
vk::ShaderModule shader_module;
|
|
120
|
-
vk::DescriptorSetLayout dsl;
|
|
121
|
-
std::vector<vk::DescriptorPool> descriptor_pools;
|
|
122
|
-
std::vector<vk::DescriptorSet> descriptor_sets;
|
|
123
|
-
uint32_t descriptor_set_idx;
|
|
124
110
|
vk::PipelineLayout layout;
|
|
125
111
|
vk::Pipeline pipeline;
|
|
126
112
|
uint32_t push_constant_size;
|
|
@@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context {
|
|
|
167
153
|
vk_device device;
|
|
168
154
|
};
|
|
169
155
|
|
|
156
|
+
struct vk_queue;
|
|
157
|
+
|
|
158
|
+
// Stores command pool/buffers. There's an instance of this
|
|
159
|
+
// for each (context,queue) pair and for each (device,queue) pair.
|
|
160
|
+
struct vk_command_pool {
|
|
161
|
+
void init(vk_device& device, vk_queue *q_);
|
|
162
|
+
void destroy(vk::Device& device);
|
|
163
|
+
|
|
164
|
+
vk::CommandPool pool;
|
|
165
|
+
uint32_t cmd_buffer_idx;
|
|
166
|
+
std::vector<vk::CommandBuffer> cmd_buffers;
|
|
167
|
+
|
|
168
|
+
vk_queue *q;
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
// Prevent simultaneous submissions to the same queue.
|
|
172
|
+
// This could be per vk_queue if we stopped having two vk_queue structures
|
|
173
|
+
// sharing the same vk::Queue.
|
|
174
|
+
static std::mutex queue_mutex;
|
|
175
|
+
|
|
176
|
+
struct vk_queue {
|
|
177
|
+
uint32_t queue_family_index;
|
|
178
|
+
vk::Queue queue;
|
|
179
|
+
|
|
180
|
+
vk_command_pool cmd_pool;
|
|
181
|
+
|
|
182
|
+
vk::PipelineStageFlags stage_flags;
|
|
183
|
+
|
|
184
|
+
bool transfer_only;
|
|
185
|
+
|
|
186
|
+
// copy everything except the cmd_pool
|
|
187
|
+
void copyFrom(vk_queue &other) {
|
|
188
|
+
queue_family_index = other.queue_family_index;
|
|
189
|
+
queue = other.queue;
|
|
190
|
+
stage_flags = other.stage_flags;
|
|
191
|
+
transfer_only = other.transfer_only;
|
|
192
|
+
}
|
|
193
|
+
};
|
|
194
|
+
|
|
170
195
|
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
|
|
171
196
|
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
|
172
197
|
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
|
|
@@ -196,6 +221,7 @@ enum vk_device_architecture {
|
|
|
196
221
|
AMD_RDNA1,
|
|
197
222
|
AMD_RDNA2,
|
|
198
223
|
AMD_RDNA3,
|
|
224
|
+
INTEL_XE2,
|
|
199
225
|
};
|
|
200
226
|
|
|
201
227
|
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
|
|
@@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
|
|
246
272
|
}
|
|
247
273
|
return vk_device_architecture::AMD_RDNA2;
|
|
248
274
|
}
|
|
275
|
+
} else if (props.vendorID == VK_VENDOR_ID_INTEL) {
|
|
276
|
+
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
|
277
|
+
|
|
278
|
+
bool subgroup_size_control = false;
|
|
279
|
+
|
|
280
|
+
for (const auto& properties : ext_props) {
|
|
281
|
+
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
|
282
|
+
subgroup_size_control = true;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
if (!subgroup_size_control) {
|
|
287
|
+
return vk_device_architecture::OTHER;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
vk::PhysicalDeviceProperties2 props2;
|
|
291
|
+
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
|
292
|
+
|
|
293
|
+
props2.pNext = &subgroup_size_control_props;
|
|
294
|
+
device.getProperties2(&props2);
|
|
295
|
+
|
|
296
|
+
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
|
297
|
+
// Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
|
|
298
|
+
// Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
|
|
299
|
+
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
|
300
|
+
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
|
301
|
+
return vk_device_architecture::INTEL_XE2;
|
|
302
|
+
}
|
|
249
303
|
}
|
|
250
304
|
return vk_device_architecture::OTHER;
|
|
251
305
|
}
|
|
@@ -312,6 +366,8 @@ struct vk_device_struct {
|
|
|
312
366
|
// set to true to indicate that some shaders need to be compiled after the dryrun
|
|
313
367
|
bool need_compiles {};
|
|
314
368
|
|
|
369
|
+
vk::DescriptorSetLayout dsl;
|
|
370
|
+
|
|
315
371
|
vk_matmul_pipeline pipeline_matmul_f32 {};
|
|
316
372
|
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
|
|
317
373
|
vk_matmul_pipeline pipeline_matmul_bf16 {};
|
|
@@ -396,6 +452,7 @@ struct vk_device_struct {
|
|
|
396
452
|
vk_pipeline pipeline_count_equal_i32;
|
|
397
453
|
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
|
398
454
|
vk_pipeline pipeline_timestep_embedding_f32;
|
|
455
|
+
vk_pipeline pipeline_conv_transpose_1d_f32;
|
|
399
456
|
vk_pipeline pipeline_pool2d_f32;
|
|
400
457
|
vk_pipeline pipeline_rwkv_wkv6_f32;
|
|
401
458
|
vk_pipeline pipeline_rwkv_wkv7_f32;
|
|
@@ -428,7 +485,6 @@ struct vk_device_struct {
|
|
|
428
485
|
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
|
429
486
|
|
|
430
487
|
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
|
431
|
-
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
|
432
488
|
|
|
433
489
|
std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
|
|
434
490
|
|
|
@@ -444,7 +500,7 @@ struct vk_device_struct {
|
|
|
444
500
|
// for GGML_VK_PERF_LOGGER
|
|
445
501
|
std::unique_ptr<vk_perf_logger> perf_logger;
|
|
446
502
|
vk::QueryPool query_pool;
|
|
447
|
-
|
|
503
|
+
int32_t num_queries;
|
|
448
504
|
|
|
449
505
|
~vk_device_struct() {
|
|
450
506
|
VK_LOG_DEBUG("destroy device " << name);
|
|
@@ -453,10 +509,8 @@ struct vk_device_struct {
|
|
|
453
509
|
|
|
454
510
|
ggml_vk_destroy_buffer(sync_staging);
|
|
455
511
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
device.destroyCommandPool(transfer_queue.pool);
|
|
459
|
-
}
|
|
512
|
+
compute_queue.cmd_pool.destroy(device);
|
|
513
|
+
transfer_queue.cmd_pool.destroy(device);
|
|
460
514
|
|
|
461
515
|
for (auto& pipeline : pipelines) {
|
|
462
516
|
if (pipeline.second.expired()) {
|
|
@@ -468,10 +522,26 @@ struct vk_device_struct {
|
|
|
468
522
|
}
|
|
469
523
|
pipelines.clear();
|
|
470
524
|
|
|
525
|
+
device.destroyDescriptorSetLayout(dsl);
|
|
526
|
+
|
|
471
527
|
device.destroy();
|
|
472
528
|
}
|
|
473
529
|
};
|
|
474
530
|
|
|
531
|
+
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
|
|
532
|
+
cmd_buffer_idx = 0;
|
|
533
|
+
q = q_;
|
|
534
|
+
|
|
535
|
+
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
|
|
536
|
+
pool = device->device.createCommandPool(command_pool_create_info);
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
void vk_command_pool::destroy(vk::Device& device) {
|
|
540
|
+
device.destroyCommandPool(pool);
|
|
541
|
+
pool = nullptr;
|
|
542
|
+
cmd_buffers.clear();
|
|
543
|
+
}
|
|
544
|
+
|
|
475
545
|
struct vk_buffer_struct {
|
|
476
546
|
vk::Buffer buffer = VK_NULL_HANDLE;
|
|
477
547
|
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
|
|
@@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants {
|
|
|
706
776
|
uint32_t max_period;
|
|
707
777
|
};
|
|
708
778
|
|
|
779
|
+
struct vk_op_conv_transpose_1d_push_constants {
|
|
780
|
+
uint32_t Cout;
|
|
781
|
+
uint32_t Cin;
|
|
782
|
+
uint32_t K;
|
|
783
|
+
uint32_t L;
|
|
784
|
+
uint32_t KL;
|
|
785
|
+
|
|
786
|
+
uint32_t nb01;
|
|
787
|
+
uint32_t nb02;
|
|
788
|
+
uint32_t nb11;
|
|
789
|
+
uint32_t nb1;
|
|
790
|
+
|
|
791
|
+
int32_t s0;
|
|
792
|
+
};
|
|
793
|
+
|
|
709
794
|
struct vk_op_pool2d_push_constants {
|
|
710
795
|
uint32_t IW; uint32_t IH;
|
|
711
796
|
uint32_t OW; uint32_t OH;
|
|
@@ -774,7 +859,7 @@ struct vk_context_struct {
|
|
|
774
859
|
std::vector<vk_staging_memcpy> in_memcpys;
|
|
775
860
|
std::vector<vk_staging_memcpy> out_memcpys;
|
|
776
861
|
|
|
777
|
-
|
|
862
|
+
vk_command_pool * p {};
|
|
778
863
|
};
|
|
779
864
|
typedef std::shared_ptr<vk_context_struct> vk_context;
|
|
780
865
|
typedef std::weak_ptr<vk_context_struct> vk_context_ref;
|
|
@@ -885,6 +970,14 @@ struct ggml_backend_vk_context {
|
|
|
885
970
|
vk_context_ref transfer_ctx;
|
|
886
971
|
|
|
887
972
|
std::vector<vk_context_ref> tensor_ctxs;
|
|
973
|
+
|
|
974
|
+
std::vector<vk::DescriptorPool> descriptor_pools;
|
|
975
|
+
std::vector<vk::DescriptorSet> descriptor_sets;
|
|
976
|
+
uint32_t descriptor_set_idx {};
|
|
977
|
+
uint32_t pipeline_descriptor_set_requirements {};
|
|
978
|
+
|
|
979
|
+
vk_command_pool compute_cmd_pool;
|
|
980
|
+
vk_command_pool transfer_cmd_pool;
|
|
888
981
|
};
|
|
889
982
|
|
|
890
983
|
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
|
@@ -1015,39 +1108,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|
|
1015
1108
|
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
|
|
1016
1109
|
disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
|
|
1017
1110
|
GGML_ASSERT(parameter_count > 0);
|
|
1111
|
+
GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
|
|
1018
1112
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
|
1019
1113
|
|
|
1020
1114
|
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
|
1021
1115
|
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
|
1022
1116
|
|
|
1023
|
-
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
|
1024
|
-
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
|
1025
|
-
for (uint32_t i = 0; i < parameter_count; i++) {
|
|
1026
|
-
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
|
1027
|
-
dsl_binding_flags.push_back({});
|
|
1028
|
-
}
|
|
1029
|
-
|
|
1030
|
-
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
|
1031
|
-
|
|
1032
1117
|
vk::PushConstantRange pcr(
|
|
1033
1118
|
vk::ShaderStageFlagBits::eCompute,
|
|
1034
1119
|
0,
|
|
1035
1120
|
pipeline->push_constant_size
|
|
1036
1121
|
);
|
|
1037
1122
|
|
|
1038
|
-
vk::
|
|
1039
|
-
{},
|
|
1040
|
-
dsl_binding);
|
|
1041
|
-
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
|
1042
|
-
pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
|
1043
|
-
|
|
1044
|
-
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
|
1045
|
-
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
|
1046
|
-
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
|
1047
|
-
|
|
1048
|
-
pipeline->descriptor_set_idx = 0;
|
|
1049
|
-
|
|
1050
|
-
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
|
|
1123
|
+
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
|
|
1051
1124
|
pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
|
|
1052
1125
|
|
|
1053
1126
|
std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
|
|
@@ -1122,15 +1195,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|
|
1122
1195
|
|
|
1123
1196
|
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
|
1124
1197
|
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
|
|
1125
|
-
for (auto& pool : pipeline->descriptor_pools) {
|
|
1126
|
-
device.destroyDescriptorPool(pool);
|
|
1127
|
-
}
|
|
1128
|
-
pipeline->descriptor_pools.clear();
|
|
1129
|
-
pipeline->descriptor_sets.clear();
|
|
1130
|
-
pipeline->descriptor_set_idx = 0;
|
|
1131
|
-
|
|
1132
|
-
device.destroyDescriptorSetLayout(pipeline->dsl);
|
|
1133
|
-
|
|
1134
1198
|
device.destroyPipelineLayout(pipeline->layout);
|
|
1135
1199
|
|
|
1136
1200
|
device.destroyShaderModule(pipeline->shader_module);
|
|
@@ -1138,97 +1202,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
|
|
1138
1202
|
device.destroyPipeline(pipeline->pipeline);
|
|
1139
1203
|
}
|
|
1140
1204
|
|
|
1141
|
-
static void ggml_pipeline_request_descriptor_sets(
|
|
1205
|
+
static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
|
|
1142
1206
|
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
|
1143
|
-
|
|
1207
|
+
ctx->pipeline_descriptor_set_requirements += n;
|
|
1144
1208
|
if (!pipeline->compiled) {
|
|
1145
1209
|
pipeline->needed = true;
|
|
1146
|
-
device->need_compiles = true;
|
|
1210
|
+
ctx->device->need_compiles = true;
|
|
1147
1211
|
}
|
|
1148
1212
|
}
|
|
1149
1213
|
|
|
1150
|
-
static void ggml_pipeline_allocate_descriptor_sets(
|
|
1151
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1152
|
-
|
|
1153
|
-
for (auto& pair : device->pipeline_descriptor_set_requirements) {
|
|
1154
|
-
vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
|
|
1155
|
-
const uint64_t n = pair.second;
|
|
1156
|
-
|
|
1157
|
-
VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
|
1214
|
+
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
|
|
1158
1215
|
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1216
|
+
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
|
|
1217
|
+
// Enough descriptors are available
|
|
1218
|
+
return;
|
|
1219
|
+
}
|
|
1163
1220
|
|
|
1164
|
-
|
|
1165
|
-
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1166
|
-
uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1221
|
+
vk_device& device = ctx->device;
|
|
1167
1222
|
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1223
|
+
uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
|
|
1224
|
+
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1225
|
+
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1172
1226
|
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
}
|
|
1227
|
+
while (to_alloc > 0) {
|
|
1228
|
+
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
|
|
1229
|
+
to_alloc -= alloc_count;
|
|
1230
|
+
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1178
1231
|
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
|
1185
|
-
pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
|
|
1232
|
+
if (pool_idx >= ctx->descriptor_pools.size()) {
|
|
1233
|
+
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
|
1234
|
+
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
|
1235
|
+
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
|
1236
|
+
}
|
|
1186
1237
|
|
|
1187
|
-
|
|
1238
|
+
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
|
|
1239
|
+
for (uint32_t i = 0; i < alloc_count; i++) {
|
|
1240
|
+
layouts[i] = device->dsl;
|
|
1188
1241
|
}
|
|
1189
|
-
|
|
1190
|
-
|
|
1242
|
+
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
|
|
1243
|
+
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
|
1244
|
+
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
|
|
1191
1245
|
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
pipeline->descriptor_set_idx = 0;
|
|
1246
|
+
pool_idx++;
|
|
1247
|
+
}
|
|
1195
1248
|
}
|
|
1196
1249
|
|
|
1197
|
-
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device,
|
|
1250
|
+
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
|
1198
1251
|
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
|
1199
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1200
1252
|
|
|
1201
|
-
if (
|
|
1253
|
+
if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
|
|
1202
1254
|
// Reuse command buffer
|
|
1203
|
-
return
|
|
1255
|
+
return p.cmd_buffers[p.cmd_buffer_idx++];
|
|
1204
1256
|
}
|
|
1205
1257
|
|
|
1206
1258
|
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
|
1207
|
-
|
|
1259
|
+
p.pool,
|
|
1208
1260
|
vk::CommandBufferLevel::ePrimary,
|
|
1209
1261
|
1);
|
|
1210
1262
|
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
|
1211
1263
|
auto buf = cmd_buffers.front();
|
|
1212
1264
|
|
|
1213
|
-
|
|
1214
|
-
|
|
1265
|
+
p.cmd_buffers.push_back(buf);
|
|
1266
|
+
p.cmd_buffer_idx++;
|
|
1215
1267
|
|
|
1216
1268
|
return buf;
|
|
1217
1269
|
}
|
|
1218
1270
|
|
|
1219
|
-
static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
|
1220
|
-
VK_LOG_DEBUG("ggml_vk_create_submission()");
|
|
1221
|
-
vk_submission s;
|
|
1222
|
-
s.buffer = ggml_vk_create_cmd_buffer(device, q);
|
|
1223
|
-
s.wait_semaphores = std::move(wait_semaphores);
|
|
1224
|
-
s.signal_semaphores = std::move(signal_semaphores);
|
|
1225
|
-
return s;
|
|
1226
|
-
}
|
|
1227
|
-
|
|
1228
1271
|
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
1229
1272
|
if (ctx->seqs.empty()) {
|
|
1230
1273
|
if (fence) {
|
|
1231
|
-
|
|
1274
|
+
std::lock_guard<std::mutex> guard(queue_mutex);
|
|
1275
|
+
ctx->p->q->queue.submit({}, fence);
|
|
1232
1276
|
}
|
|
1233
1277
|
return;
|
|
1234
1278
|
}
|
|
@@ -1267,7 +1311,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
|
1267
1311
|
tl_signal_vals.push_back({});
|
|
1268
1312
|
tl_signal_semaphores.push_back({});
|
|
1269
1313
|
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
|
1270
|
-
stage_flags[idx].push_back(ctx->q->stage_flags);
|
|
1314
|
+
stage_flags[idx].push_back(ctx->p->q->stage_flags);
|
|
1271
1315
|
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
|
1272
1316
|
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
|
1273
1317
|
}
|
|
@@ -1297,7 +1341,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
|
1297
1341
|
}
|
|
1298
1342
|
}
|
|
1299
1343
|
|
|
1300
|
-
|
|
1344
|
+
std::lock_guard<std::mutex> guard(queue_mutex);
|
|
1345
|
+
ctx->p->q->queue.submit(submit_infos, fence);
|
|
1301
1346
|
|
|
1302
1347
|
ctx->seqs.clear();
|
|
1303
1348
|
}
|
|
@@ -1355,28 +1400,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
|
|
|
1355
1400
|
q.queue_family_index = queue_family_index;
|
|
1356
1401
|
q.transfer_only = transfer_only;
|
|
1357
1402
|
|
|
1358
|
-
|
|
1359
|
-
q.pool = device->device.createCommandPool(command_pool_create_info_compute);
|
|
1360
|
-
|
|
1361
|
-
q.cmd_buffer_idx = 0;
|
|
1403
|
+
q.cmd_pool.init(device, &q);
|
|
1362
1404
|
|
|
1363
1405
|
q.queue = device->device.getQueue(queue_family_index, queue_index);
|
|
1364
1406
|
|
|
1365
1407
|
q.stage_flags = stage_flags;
|
|
1366
1408
|
}
|
|
1367
1409
|
|
|
1368
|
-
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx,
|
|
1410
|
+
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
|
|
1369
1411
|
vk_context result = std::make_shared<vk_context_struct>();
|
|
1370
1412
|
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
|
|
1371
1413
|
ctx->gc.contexts.emplace_back(result);
|
|
1372
|
-
result->
|
|
1414
|
+
result->p = &p;
|
|
1373
1415
|
return result;
|
|
1374
1416
|
}
|
|
1375
1417
|
|
|
1376
|
-
static vk_context ggml_vk_create_temporary_context(
|
|
1418
|
+
static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
|
|
1377
1419
|
vk_context result = std::make_shared<vk_context_struct>();
|
|
1378
1420
|
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
|
|
1379
|
-
result->
|
|
1421
|
+
result->p = &p;
|
|
1380
1422
|
return result;
|
|
1381
1423
|
}
|
|
1382
1424
|
|
|
@@ -1409,15 +1451,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
|
|
|
1409
1451
|
return ctx->gc.events[ctx->event_idx++];
|
|
1410
1452
|
}
|
|
1411
1453
|
|
|
1412
|
-
static void
|
|
1413
|
-
VK_LOG_DEBUG("
|
|
1414
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1454
|
+
static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
|
|
1455
|
+
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
|
|
1415
1456
|
|
|
1416
1457
|
// Requires command buffers to be done
|
|
1417
|
-
device->device.resetCommandPool(
|
|
1418
|
-
|
|
1458
|
+
device->device.resetCommandPool(p.pool);
|
|
1459
|
+
p.cmd_buffer_idx = 0;
|
|
1460
|
+
}
|
|
1461
|
+
|
|
1462
|
+
// Recycle the command buffers of the device-owned queue command pools.
// Each pool (compute first, then transfer) is reset via
// ggml_vk_command_pool_cleanup once it has handed out at least
// `reset_threshold` command buffers since its previous reset.
// NOTE(review): the reset presumably requires all buffers of the pool to have
// finished executing; the visible callers invoke this right after waiting on
// the device fence - confirm for any new call site.
static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
    VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");

    // Arbitrary frequency to cleanup/reuse command buffers
    static constexpr uint32_t reset_threshold = 10;

    vk_command_pool * const queue_pools[] = {
        &device->compute_queue.cmd_pool,
        &device->transfer_queue.cmd_pool,
    };

    for (vk_command_pool * pool : queue_pools) {
        if (pool->cmd_buffer_idx < reset_threshold) {
            continue;
        }
        ggml_vk_command_pool_cleanup(device, *pool);
    }
}
|
|
1420
1475
|
|
|
1476
|
+
|
|
1421
1477
|
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
|
1422
1478
|
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
|
1423
1479
|
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
|
@@ -1436,8 +1492,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|
|
1436
1492
|
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
|
|
1437
1493
|
}
|
|
1438
1494
|
|
|
1439
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1440
|
-
|
|
1441
1495
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
|
1442
1496
|
|
|
1443
1497
|
if (size == 0) {
|
|
@@ -1566,11 +1620,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
|
|
1566
1620
|
static void ggml_vk_sync_buffers(vk_context& ctx) {
|
|
1567
1621
|
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
|
|
1568
1622
|
|
|
1569
|
-
const bool transfer_queue = ctx->q->transfer_only;
|
|
1623
|
+
const bool transfer_queue = ctx->p->q->transfer_only;
|
|
1570
1624
|
|
|
1571
1625
|
ctx->s->buffer.pipelineBarrier(
|
|
1572
|
-
ctx->q->stage_flags,
|
|
1573
|
-
ctx->q->stage_flags,
|
|
1626
|
+
ctx->p->q->stage_flags,
|
|
1627
|
+
ctx->p->q->stage_flags,
|
|
1574
1628
|
{},
|
|
1575
1629
|
{ {
|
|
1576
1630
|
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
|
|
@@ -1589,8 +1643,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
|
|
1589
1643
|
|
|
1590
1644
|
ctx->s->buffer.waitEvents(
|
|
1591
1645
|
events,
|
|
1592
|
-
ctx->q->stage_flags,
|
|
1593
|
-
ctx->q->stage_flags,
|
|
1646
|
+
ctx->p->q->stage_flags,
|
|
1647
|
+
ctx->p->q->stage_flags,
|
|
1594
1648
|
{},
|
|
1595
1649
|
{},
|
|
1596
1650
|
{}
|
|
@@ -1652,7 +1706,7 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t D, uint32_
|
|
|
1652
1706
|
return {64, 32};
|
|
1653
1707
|
}
|
|
1654
1708
|
return {64, 64};
|
|
1655
|
-
}
|
|
1709
|
+
}
|
|
1656
1710
|
|
|
1657
1711
|
static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {
|
|
1658
1712
|
|
|
@@ -2726,6 +2780,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
2726
2780
|
|
|
2727
2781
|
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
|
2728
2782
|
|
|
2783
|
+
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
|
2784
|
+
|
|
2729
2785
|
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
|
2730
2786
|
|
|
2731
2787
|
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
|
@@ -3322,6 +3378,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
3322
3378
|
}
|
|
3323
3379
|
}
|
|
3324
3380
|
|
|
3381
|
+
|
|
3382
|
+
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
|
3383
|
+
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
|
3384
|
+
for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
|
|
3385
|
+
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
|
3386
|
+
dsl_binding_flags.push_back({});
|
|
3387
|
+
}
|
|
3388
|
+
|
|
3389
|
+
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
|
3390
|
+
|
|
3391
|
+
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
|
|
3392
|
+
{},
|
|
3393
|
+
dsl_binding);
|
|
3394
|
+
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
|
3395
|
+
device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
|
3396
|
+
|
|
3325
3397
|
ggml_vk_load_shaders(device);
|
|
3326
3398
|
|
|
3327
3399
|
if (!device->single_queue) {
|
|
@@ -3329,7 +3401,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
3329
3401
|
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
|
3330
3402
|
} else {
|
|
3331
3403
|
// TODO: Use pointer or reference to avoid copy
|
|
3332
|
-
device->transfer_queue
|
|
3404
|
+
device->transfer_queue.copyFrom(device->compute_queue);
|
|
3405
|
+
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
|
|
3333
3406
|
}
|
|
3334
3407
|
|
|
3335
3408
|
device->buffer_type = {
|
|
@@ -3548,11 +3621,11 @@ static void ggml_vk_instance_init() {
|
|
|
3548
3621
|
|
|
3549
3622
|
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
|
3550
3623
|
|
|
3551
|
-
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
|
3552
|
-
|
|
3553
3624
|
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
|
3554
3625
|
char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
|
|
3555
3626
|
if (devices_env != nullptr) {
|
|
3627
|
+
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
|
3628
|
+
|
|
3556
3629
|
std::string devices(devices_env);
|
|
3557
3630
|
std::replace(devices.begin(), devices.end(), ',', ' ');
|
|
3558
3631
|
|
|
@@ -3568,9 +3641,9 @@ static void ggml_vk_instance_init() {
|
|
|
3568
3641
|
} else {
|
|
3569
3642
|
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
|
3570
3643
|
|
|
3571
|
-
//
|
|
3644
|
+
// If no vulkan devices are found, return early
|
|
3572
3645
|
if (devices.empty()) {
|
|
3573
|
-
|
|
3646
|
+
GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
|
|
3574
3647
|
return;
|
|
3575
3648
|
}
|
|
3576
3649
|
|
|
@@ -3653,9 +3726,20 @@ static void ggml_vk_instance_init() {
|
|
|
3653
3726
|
}
|
|
3654
3727
|
}
|
|
3655
3728
|
|
|
3656
|
-
// If no dedicated GPUs found, fall back to
|
|
3729
|
+
// If no dedicated GPUs found, fall back to the first non-CPU device.
|
|
3730
|
+
// If only CPU devices are available, return without devices.
|
|
3657
3731
|
if (vk_instance.device_indices.empty()) {
|
|
3658
|
-
|
|
3732
|
+
for (size_t i = 0; i < devices.size(); i++) {
|
|
3733
|
+
if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) {
|
|
3734
|
+
vk_instance.device_indices.push_back(i);
|
|
3735
|
+
break;
|
|
3736
|
+
}
|
|
3737
|
+
}
|
|
3738
|
+
}
|
|
3739
|
+
|
|
3740
|
+
if (vk_instance.device_indices.empty()) {
|
|
3741
|
+
GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
|
|
3742
|
+
return;
|
|
3659
3743
|
}
|
|
3660
3744
|
}
|
|
3661
3745
|
GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
|
|
@@ -3684,6 +3768,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
|
3684
3768
|
ctx->fence = ctx->device->device.createFence({});
|
|
3685
3769
|
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
|
3686
3770
|
|
|
3771
|
+
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
|
|
3772
|
+
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
|
|
3773
|
+
|
|
3687
3774
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
3688
3775
|
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
|
|
3689
3776
|
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
|
|
@@ -4049,9 +4136,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
|
|
|
4049
4136
|
}
|
|
4050
4137
|
}
|
|
4051
4138
|
|
|
4052
|
-
static vk_submission ggml_vk_begin_submission(vk_device& device,
|
|
4139
|
+
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
|
|
4053
4140
|
vk_submission s;
|
|
4054
|
-
s.buffer = ggml_vk_create_cmd_buffer(device,
|
|
4141
|
+
s.buffer = ggml_vk_create_cmd_buffer(device, p);
|
|
4055
4142
|
if (one_time) {
|
|
4056
4143
|
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
|
4057
4144
|
} else {
|
|
@@ -4061,7 +4148,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
|
|
|
4061
4148
|
return s;
|
|
4062
4149
|
}
|
|
4063
4150
|
|
|
4064
|
-
|
|
4151
|
+
// push_constant_size: number of bytes of push-constant payload carried by `t`.
// The result is passed as the size argument of vk::CommandBuffer::pushConstants
// by ggml_vk_dispatch_pipeline. Three payload shapes are supported.

// Generic payload: one struct/class object, pushed as a whole.
// Returns sizeof(T). The argument is only used for type deduction.
template <typename T> size_t push_constant_size(const T &) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return sizeof(T);
}
// std::vector payload: only the elements are pushed, so the size is the
// element size times the current element count (0 for an empty vector).
template <typename T> size_t push_constant_size(const std::vector<T> &t) {
    return sizeof(T) * t.size();
}
// std::array payload: element size times the compile-time extent.
// N must be size_t: std::array's extent parameter is std::size_t, and template
// argument deduction fails when the declared non-type parameter type differs
// (e.g. uint32_t), which would silently route arrays to the generic overload.
template <typename T, size_t N> size_t push_constant_size(const std::array<T, N> &) {
    return sizeof(T) * N;
}
|
|
4164
|
+
|
|
4165
|
+
// push_constant_data: pointer to the first byte/element of the push-constant
// payload held by `t`. The result is passed as the data argument of
// vk::CommandBuffer::pushConstants by ggml_vk_dispatch_pipeline. The pointer
// refers into `t`, so it is only valid while `t` is alive and unmodified.

// Generic payload: one struct/class object; returns its address.
template <typename T> const T *push_constant_data(const T &t) {
    static_assert(std::is_class<T>::value, "T must be a struct/class");
    return &t;
}
// std::vector payload: returns the element storage, not the vector object.
template <typename T> const T *push_constant_data(const std::vector<T> &t) {
    return t.data();
}
// std::array payload: returns the element storage as const T *.
// N must be size_t: std::array's extent parameter is std::size_t, and template
// argument deduction fails when the declared non-type parameter type differs
// (e.g. uint32_t), which would silently route arrays to the generic overload
// and return a pointer to the array object instead of to its elements.
template <typename T, size_t N> const T *push_constant_data(const std::array<T, N> &t) {
    return t.data();
}
|
|
4175
|
+
|
|
4176
|
+
template <typename T>
|
|
4177
|
+
static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
|
|
4065
4178
|
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
|
|
4066
4179
|
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
|
|
4067
4180
|
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
|
@@ -4070,14 +4183,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
|
|
4070
4183
|
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
|
|
4071
4184
|
}
|
|
4072
4185
|
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
|
4073
|
-
GGML_ASSERT(
|
|
4074
|
-
GGML_ASSERT(descriptor_buffer_infos.size()
|
|
4186
|
+
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
|
4187
|
+
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
|
4075
4188
|
|
|
4076
|
-
vk::DescriptorSet& descriptor_set =
|
|
4189
|
+
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
|
4077
4190
|
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
|
4078
4191
|
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
|
|
4079
4192
|
|
|
4080
|
-
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
|
|
4193
|
+
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
|
|
4081
4194
|
subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
|
|
4082
4195
|
subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
|
|
4083
4196
|
pipeline->layout,
|
|
@@ -4110,7 +4223,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
|
|
4110
4223
|
ggml_vk_ctx_end(subctx);
|
|
4111
4224
|
}
|
|
4112
4225
|
|
|
4113
|
-
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->
|
|
4226
|
+
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
|
|
4114
4227
|
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
|
4115
4228
|
}
|
|
4116
4229
|
|
|
@@ -4311,7 +4424,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
|
|
4311
4424
|
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
|
|
4312
4425
|
}
|
|
4313
4426
|
} else {
|
|
4314
|
-
|
|
4427
|
+
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
|
4428
|
+
|
|
4429
|
+
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
|
4315
4430
|
ggml_vk_ctx_begin(dst->device, subctx);
|
|
4316
4431
|
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
|
|
4317
4432
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4323,6 +4438,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
|
|
4323
4438
|
ggml_vk_submit(subctx, dst->device->fence);
|
|
4324
4439
|
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
|
4325
4440
|
dst->device->device.resetFences({ dst->device->fence });
|
|
4441
|
+
ggml_vk_queue_command_pools_cleanup(dst->device);
|
|
4326
4442
|
}
|
|
4327
4443
|
}
|
|
4328
4444
|
|
|
@@ -4399,7 +4515,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
|
|
4399
4515
|
|
|
4400
4516
|
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
|
4401
4517
|
} else {
|
|
4402
|
-
|
|
4518
|
+
std::lock_guard<std::mutex> guard(src->device->mutex);
|
|
4519
|
+
|
|
4520
|
+
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
|
4403
4521
|
ggml_vk_ctx_begin(src->device, subctx);
|
|
4404
4522
|
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
|
|
4405
4523
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4407,6 +4525,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
|
|
4407
4525
|
ggml_vk_submit(subctx, src->device->fence);
|
|
4408
4526
|
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
|
|
4409
4527
|
src->device->device.resetFences({ src->device->fence });
|
|
4528
|
+
ggml_vk_queue_command_pools_cleanup(src->device);
|
|
4410
4529
|
|
|
4411
4530
|
for (auto& cpy : subctx->out_memcpys) {
|
|
4412
4531
|
memcpy(cpy.dst, cpy.src, cpy.n);
|
|
@@ -4426,15 +4545,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
|
|
4426
4545
|
|
|
4427
4546
|
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
|
4428
4547
|
if (src->device == dst->device) {
|
|
4548
|
+
std::lock_guard<std::mutex> guard(src->device->mutex);
|
|
4429
4549
|
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
|
|
4430
4550
|
// Copy within the device
|
|
4431
|
-
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
|
|
4551
|
+
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
|
4432
4552
|
ggml_vk_ctx_begin(src->device, subctx);
|
|
4433
4553
|
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
|
|
4434
4554
|
ggml_vk_ctx_end(subctx);
|
|
4435
4555
|
ggml_vk_submit(subctx, src->device->fence);
|
|
4436
4556
|
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
|
|
4437
4557
|
src->device->device.resetFences({ src->device->fence });
|
|
4558
|
+
ggml_vk_queue_command_pools_cleanup(src->device);
|
|
4438
4559
|
} else {
|
|
4439
4560
|
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
|
|
4440
4561
|
// Copy device to device
|
|
@@ -4459,7 +4580,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
|
|
|
4459
4580
|
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
|
4460
4581
|
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
|
|
4461
4582
|
|
|
4462
|
-
|
|
4583
|
+
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
|
4584
|
+
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
|
4463
4585
|
ggml_vk_ctx_begin(dst->device, subctx);
|
|
4464
4586
|
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
|
4465
4587
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4467,6 +4589,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
|
|
4467
4589
|
ggml_vk_submit(subctx, dst->device->fence);
|
|
4468
4590
|
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
|
|
4469
4591
|
dst->device->device.resetFences({ dst->device->fence });
|
|
4592
|
+
ggml_vk_queue_command_pools_cleanup(dst->device);
|
|
4470
4593
|
}
|
|
4471
4594
|
|
|
4472
4595
|
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
|
|
@@ -4540,7 +4663,7 @@ static void ggml_vk_matmul(
|
|
|
4540
4663
|
ggml_vk_sync_buffers(subctx);
|
|
4541
4664
|
if (split_k == 1) {
|
|
4542
4665
|
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
|
|
4543
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d },
|
|
4666
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
|
|
4544
4667
|
return;
|
|
4545
4668
|
}
|
|
4546
4669
|
|
|
@@ -4548,10 +4671,10 @@ static void ggml_vk_matmul(
|
|
|
4548
4671
|
|
|
4549
4672
|
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
|
|
4550
4673
|
// Make sure enough workgroups get assigned for split k to work
|
|
4551
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer },
|
|
4674
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
|
4552
4675
|
ggml_vk_sync_buffers(subctx);
|
|
4553
4676
|
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
|
|
4554
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2
|
|
4677
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
|
|
4555
4678
|
}
|
|
4556
4679
|
|
|
4557
4680
|
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
|
|
@@ -4599,7 +4722,7 @@ static void ggml_vk_matmul_id(
|
|
|
4599
4722
|
ggml_vk_sync_buffers(subctx);
|
|
4600
4723
|
const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
|
|
4601
4724
|
nei0, nei1, nbi1, ne11, padded_n };
|
|
4602
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids },
|
|
4725
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
|
|
4603
4726
|
}
|
|
4604
4727
|
|
|
4605
4728
|
static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|
@@ -4720,7 +4843,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|
|
4720
4843
|
};
|
|
4721
4844
|
init_pushconst_fastdiv(pc);
|
|
4722
4845
|
ggml_vk_sync_buffers(subctx);
|
|
4723
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out },
|
|
4846
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
|
|
4724
4847
|
}
|
|
4725
4848
|
|
|
4726
4849
|
static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
|
|
@@ -4739,7 +4862,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4739
4862
|
vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
|
|
4740
4863
|
|
|
4741
4864
|
ggml_vk_sync_buffers(subctx);
|
|
4742
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out },
|
|
4865
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
|
|
4743
4866
|
}
|
|
4744
4867
|
|
|
4745
4868
|
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -4880,18 +5003,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4880
5003
|
}
|
|
4881
5004
|
|
|
4882
5005
|
// Request descriptor sets
|
|
4883
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5006
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
4884
5007
|
if (qx_needs_dequant) {
|
|
4885
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5008
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
4886
5009
|
}
|
|
4887
5010
|
if (qy_needs_dequant) {
|
|
4888
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5011
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
4889
5012
|
}
|
|
4890
5013
|
if (quantize_y) {
|
|
4891
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5014
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
|
|
4892
5015
|
}
|
|
4893
5016
|
if (split_k > 1) {
|
|
4894
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5017
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
|
|
4895
5018
|
}
|
|
4896
5019
|
return;
|
|
4897
5020
|
}
|
|
@@ -4939,7 +5062,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4939
5062
|
} else if (qx_needs_dequant) {
|
|
4940
5063
|
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
|
4941
5064
|
ggml_vk_sync_buffers(subctx);
|
|
4942
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc
|
|
5065
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
|
4943
5066
|
}
|
|
4944
5067
|
if (y_non_contig) {
|
|
4945
5068
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
@@ -5073,12 +5196,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5073
5196
|
|
|
5074
5197
|
// Request descriptor sets
|
|
5075
5198
|
if (qx_needs_dequant) {
|
|
5076
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5199
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5077
5200
|
}
|
|
5078
5201
|
if (qy_needs_dequant) {
|
|
5079
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5202
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5080
5203
|
}
|
|
5081
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5204
|
+
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
|
5082
5205
|
return;
|
|
5083
5206
|
}
|
|
5084
5207
|
|
|
@@ -5155,7 +5278,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5155
5278
|
ggml_vk_sync_buffers(subctx);
|
|
5156
5279
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
5157
5280
|
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
|
|
5158
|
-
|
|
5281
|
+
pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
|
|
5159
5282
|
}
|
|
5160
5283
|
|
|
5161
5284
|
static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5211,7 +5334,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
5211
5334
|
|
|
5212
5335
|
if (dryrun) {
|
|
5213
5336
|
// Request descriptor sets
|
|
5214
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5337
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
|
|
5215
5338
|
return;
|
|
5216
5339
|
}
|
|
5217
5340
|
|
|
@@ -5243,7 +5366,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
5243
5366
|
}
|
|
5244
5367
|
|
|
5245
5368
|
ggml_vk_sync_buffers(subctx);
|
|
5246
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } },
|
|
5369
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
|
|
5247
5370
|
}
|
|
5248
5371
|
|
|
5249
5372
|
static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5300,7 +5423,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
5300
5423
|
|
|
5301
5424
|
if (dryrun) {
|
|
5302
5425
|
// Request descriptor sets
|
|
5303
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5426
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
|
|
5304
5427
|
return;
|
|
5305
5428
|
}
|
|
5306
5429
|
|
|
@@ -5326,7 +5449,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
5326
5449
|
const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
|
5327
5450
|
ggml_vk_sync_buffers(subctx);
|
|
5328
5451
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
|
|
5329
|
-
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } },
|
|
5452
|
+
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
|
5330
5453
|
}
|
|
5331
5454
|
|
|
5332
5455
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5487,12 +5610,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5487
5610
|
}
|
|
5488
5611
|
|
|
5489
5612
|
// Request descriptor sets
|
|
5490
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5613
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
5491
5614
|
if (qx_needs_dequant) {
|
|
5492
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5615
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5493
5616
|
}
|
|
5494
5617
|
if (qy_needs_dequant) {
|
|
5495
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5618
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5496
5619
|
}
|
|
5497
5620
|
return;
|
|
5498
5621
|
}
|
|
@@ -5542,7 +5665,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5542
5665
|
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
|
5543
5666
|
ggml_vk_sync_buffers(subctx);
|
|
5544
5667
|
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
|
|
5545
|
-
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc
|
|
5668
|
+
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
|
5546
5669
|
}
|
|
5547
5670
|
if (y_non_contig) {
|
|
5548
5671
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
@@ -5681,12 +5804,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
5681
5804
|
|
|
5682
5805
|
// Request descriptor sets
|
|
5683
5806
|
if (qx_needs_dequant) {
|
|
5684
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5807
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5685
5808
|
}
|
|
5686
5809
|
if (qy_needs_dequant) {
|
|
5687
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5810
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5688
5811
|
}
|
|
5689
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5812
|
+
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
|
5690
5813
|
return;
|
|
5691
5814
|
}
|
|
5692
5815
|
|
|
@@ -5762,7 +5885,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
5762
5885
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
5763
5886
|
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
|
|
5764
5887
|
vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
|
|
5765
|
-
|
|
5888
|
+
pc, { groups_x, (uint32_t)nei0, groups_z });
|
|
5766
5889
|
}
|
|
5767
5890
|
|
|
5768
5891
|
static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -6006,9 +6129,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6006
6129
|
|
|
6007
6130
|
if (dryrun) {
|
|
6008
6131
|
// Request descriptor sets
|
|
6009
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6132
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6010
6133
|
if (split_k > 1) {
|
|
6011
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6134
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
|
6012
6135
|
}
|
|
6013
6136
|
return;
|
|
6014
6137
|
}
|
|
@@ -6112,7 +6235,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6112
6235
|
// there's no more than one tile of rows (i.e. workgroups_x would have been
|
|
6113
6236
|
// one). We reuse workgroups_x to mean the number of splits, so we need to
|
|
6114
6237
|
// cancel out the divide by wg_denoms[0].
|
|
6115
|
-
|
|
6238
|
+
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
|
6116
6239
|
|
|
6117
6240
|
ggml_vk_sync_buffers(subctx);
|
|
6118
6241
|
const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
|
|
@@ -6121,7 +6244,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6121
6244
|
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
|
6122
6245
|
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
|
6123
6246
|
},
|
|
6124
|
-
pc2
|
|
6247
|
+
pc2, { (uint32_t)ne1, 1, 1 });
|
|
6125
6248
|
} else {
|
|
6126
6249
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
6127
6250
|
{
|
|
@@ -6131,7 +6254,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6131
6254
|
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
|
6132
6255
|
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
|
6133
6256
|
},
|
|
6134
|
-
|
|
6257
|
+
pc, { workgroups_x, workgroups_y, workgroups_z });
|
|
6135
6258
|
}
|
|
6136
6259
|
}
|
|
6137
6260
|
|
|
@@ -6392,6 +6515,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
6392
6515
|
return ctx->device->pipeline_timestep_embedding_f32;
|
|
6393
6516
|
}
|
|
6394
6517
|
return nullptr;
|
|
6518
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
6519
|
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
6520
|
+
return ctx->device->pipeline_conv_transpose_1d_f32;
|
|
6521
|
+
}
|
|
6522
|
+
return nullptr;
|
|
6395
6523
|
case GGML_OP_POOL_2D:
|
|
6396
6524
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
6397
6525
|
return ctx->device->pipeline_pool2d_f32;
|
|
@@ -6566,7 +6694,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6566
6694
|
}
|
|
6567
6695
|
|
|
6568
6696
|
if (dryrun) {
|
|
6569
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6697
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6570
6698
|
return;
|
|
6571
6699
|
}
|
|
6572
6700
|
|
|
@@ -6726,6 +6854,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6726
6854
|
uint32_t half_ceil = (dim + 1) / 2;
|
|
6727
6855
|
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
|
6728
6856
|
} break;
|
|
6857
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
6858
|
+
{
|
|
6859
|
+
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
|
6860
|
+
} break;
|
|
6729
6861
|
case GGML_OP_POOL_2D:
|
|
6730
6862
|
{
|
|
6731
6863
|
const uint32_t N = dst->ne[3];
|
|
@@ -6800,7 +6932,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6800
6932
|
}
|
|
6801
6933
|
|
|
6802
6934
|
ggml_vk_sync_buffers(subctx);
|
|
6803
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6935
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6804
6936
|
} else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
|
|
6805
6937
|
// Empty src2 is possible in rope, but the shader needs a buffer
|
|
6806
6938
|
vk_subbuffer subbuf_z;
|
|
@@ -6811,26 +6943,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6811
6943
|
}
|
|
6812
6944
|
|
|
6813
6945
|
ggml_vk_sync_buffers(subctx);
|
|
6814
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6946
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6815
6947
|
} else if (op == GGML_OP_IM2COL) {
|
|
6816
6948
|
// im2col uses only src1 and dst buffers
|
|
6817
6949
|
ggml_vk_sync_buffers(subctx);
|
|
6818
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6950
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6819
6951
|
} else if (op == GGML_OP_COUNT_EQUAL) {
|
|
6820
6952
|
ggml_vk_sync_buffers(subctx);
|
|
6821
6953
|
// count_equal assumes that destination buffer is initialized with zeroes
|
|
6822
6954
|
ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
|
|
6823
6955
|
ggml_vk_sync_buffers(subctx);
|
|
6824
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6956
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6825
6957
|
} else if (use_src2) {
|
|
6826
6958
|
ggml_vk_sync_buffers(subctx);
|
|
6827
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6959
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6828
6960
|
} else if (use_src1) {
|
|
6829
6961
|
ggml_vk_sync_buffers(subctx);
|
|
6830
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6962
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6831
6963
|
} else {
|
|
6832
6964
|
ggml_vk_sync_buffers(subctx);
|
|
6833
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6965
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6834
6966
|
}
|
|
6835
6967
|
}
|
|
6836
6968
|
|
|
@@ -6943,7 +7075,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6943
7075
|
GGML_ASSERT(pipeline != nullptr);
|
|
6944
7076
|
|
|
6945
7077
|
if (dryrun) {
|
|
6946
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7078
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6947
7079
|
return;
|
|
6948
7080
|
}
|
|
6949
7081
|
|
|
@@ -6999,7 +7131,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6999
7131
|
vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
|
|
7000
7132
|
vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
|
|
7001
7133
|
vk_subbuffer{ d_D, dst_offset, dst_size }
|
|
7002
|
-
},
|
|
7134
|
+
}, pc, elements);
|
|
7003
7135
|
} else if (version == 7) {
|
|
7004
7136
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
|
|
7005
7137
|
vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
|
|
@@ -7010,7 +7142,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
7010
7142
|
vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
|
|
7011
7143
|
vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
|
|
7012
7144
|
vk_subbuffer{ d_D, dst_offset, dst_size }
|
|
7013
|
-
},
|
|
7145
|
+
}, pc, elements);
|
|
7014
7146
|
} else {
|
|
7015
7147
|
// shouldn't happen
|
|
7016
7148
|
GGML_ASSERT(false);
|
|
@@ -7082,7 +7214,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
|
|
|
7082
7214
|
GGML_ASSERT(pipeline != nullptr);
|
|
7083
7215
|
|
|
7084
7216
|
if (dryrun) {
|
|
7085
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7217
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
7086
7218
|
return;
|
|
7087
7219
|
}
|
|
7088
7220
|
|
|
@@ -7147,7 +7279,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
|
|
|
7147
7279
|
vk_subbuffer{ d_GM, gm_offset, gm_size },
|
|
7148
7280
|
vk_subbuffer{ d_GV, gv_offset, gv_size },
|
|
7149
7281
|
vk_subbuffer{ d_P, p_offset, p_size },
|
|
7150
|
-
},
|
|
7282
|
+
}, pc, elements);
|
|
7151
7283
|
}
|
|
7152
7284
|
|
|
7153
7285
|
static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -7529,6 +7661,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
|
|
7529
7661
|
}, dryrun);
|
|
7530
7662
|
}
|
|
7531
7663
|
|
|
7664
|
+
static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
7665
|
+
// src0: (K, Cout, Cin, 1) -- kernel
|
|
7666
|
+
// src1: (L, Cin, 1, 1) -- input
|
|
7667
|
+
// dst: (*, Cout, 1, 1)
|
|
7668
|
+
|
|
7669
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
7670
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
7671
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
7672
|
+
|
|
7673
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
7674
|
+
|
|
7675
|
+
GGML_ASSERT(nb00 == sizeof(float));
|
|
7676
|
+
GGML_ASSERT(nb10 == sizeof(float));
|
|
7677
|
+
|
|
7678
|
+
const int32_t s0 = dst->op_params[0];
|
|
7679
|
+
|
|
7680
|
+
vk_op_conv_transpose_1d_push_constants p{};
|
|
7681
|
+
p.Cout = static_cast<uint32_t>(ne01);
|
|
7682
|
+
p.Cin = static_cast<uint32_t>(ne02);
|
|
7683
|
+
p.K = static_cast<uint32_t>(ne00);
|
|
7684
|
+
p.L = static_cast<uint32_t>(ne10);
|
|
7685
|
+
p.KL = static_cast<uint32_t>(ne0);
|
|
7686
|
+
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
|
7687
|
+
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
|
7688
|
+
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
|
7689
|
+
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
|
7690
|
+
p.s0 = static_cast<uint32_t>(s0);
|
|
7691
|
+
|
|
7692
|
+
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
|
|
7693
|
+
}
|
|
7694
|
+
|
|
7532
7695
|
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
7533
7696
|
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
|
7534
7697
|
const int32_t k1 = dst->op_params[1];
|
|
@@ -7729,9 +7892,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7729
7892
|
}
|
|
7730
7893
|
}
|
|
7731
7894
|
|
|
7732
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7895
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
|
7733
7896
|
if (split_k > 1) {
|
|
7734
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7897
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
|
7735
7898
|
|
|
7736
7899
|
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
|
7737
7900
|
// Resize buffer
|
|
@@ -7746,7 +7909,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7746
7909
|
ggml_vk_load_shaders(ctx->device);
|
|
7747
7910
|
}
|
|
7748
7911
|
|
|
7749
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
7912
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
7750
7913
|
|
|
7751
7914
|
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
7752
7915
|
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
@@ -7788,7 +7951,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7788
7951
|
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
|
7789
7952
|
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
|
7790
7953
|
|
|
7791
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
7954
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
7792
7955
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
7793
7956
|
for (size_t i = 0; i < num_it; i++) {
|
|
7794
7957
|
ggml_vk_matmul(
|
|
@@ -7804,6 +7967,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7804
7967
|
ggml_vk_submit(subctx, ctx->fence);
|
|
7805
7968
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
|
|
7806
7969
|
ctx->device->device.resetFences({ ctx->fence });
|
|
7970
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
7807
7971
|
|
|
7808
7972
|
auto end = std::chrono::high_resolution_clock::now();
|
|
7809
7973
|
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
|
@@ -7905,16 +8069,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7905
8069
|
|
|
7906
8070
|
free(d_chk);
|
|
7907
8071
|
|
|
7908
|
-
|
|
7909
|
-
|
|
8072
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
|
8073
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
|
7910
8074
|
|
|
7911
8075
|
ggml_vk_destroy_buffer(d_X);
|
|
7912
8076
|
ggml_vk_destroy_buffer(d_Y);
|
|
7913
8077
|
ggml_vk_destroy_buffer(d_D);
|
|
7914
8078
|
|
|
7915
|
-
ggml_pipeline_cleanup(p);
|
|
7916
|
-
ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
|
|
7917
|
-
|
|
7918
8079
|
free(x);
|
|
7919
8080
|
free(y);
|
|
7920
8081
|
free(d);
|
|
@@ -7992,20 +8153,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
7992
8153
|
ggml_vk_quantize_data(x, qx, ne, quant);
|
|
7993
8154
|
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
|
|
7994
8155
|
|
|
7995
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8156
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
|
7996
8157
|
|
|
7997
8158
|
if (ctx->device->need_compiles) {
|
|
7998
8159
|
ggml_vk_load_shaders(ctx->device);
|
|
7999
8160
|
}
|
|
8000
8161
|
|
|
8001
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8162
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8002
8163
|
|
|
8003
8164
|
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
|
8004
8165
|
|
|
8005
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8166
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8006
8167
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8007
8168
|
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
|
8008
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc
|
|
8169
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
|
|
8009
8170
|
ggml_vk_ctx_end(subctx);
|
|
8010
8171
|
|
|
8011
8172
|
auto begin = std::chrono::high_resolution_clock::now();
|
|
@@ -8013,6 +8174,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8013
8174
|
ggml_vk_submit(subctx, ctx->fence);
|
|
8014
8175
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
|
8015
8176
|
ctx->device->device.resetFences({ ctx->fence });
|
|
8177
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8016
8178
|
|
|
8017
8179
|
auto end = std::chrono::high_resolution_clock::now();
|
|
8018
8180
|
|
|
@@ -8092,17 +8254,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8092
8254
|
//
|
|
8093
8255
|
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
|
|
8094
8256
|
//
|
|
8095
|
-
// ggml_pipeline_request_descriptor_sets(ctx
|
|
8257
|
+
// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
|
8096
8258
|
//
|
|
8097
8259
|
// if (ctx->device->need_compiles) {
|
|
8098
8260
|
// ggml_vk_load_shaders(ctx->device);
|
|
8099
8261
|
// }
|
|
8100
8262
|
//
|
|
8101
|
-
// ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8263
|
+
// ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8102
8264
|
//
|
|
8103
8265
|
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
|
|
8104
8266
|
//
|
|
8105
|
-
// vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8267
|
+
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8106
8268
|
// ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8107
8269
|
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
|
|
8108
8270
|
// ggml_vk_ctx_end(subctx);
|
|
@@ -8112,6 +8274,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8112
8274
|
// ggml_vk_submit(subctx, ctx->fence);
|
|
8113
8275
|
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
|
|
8114
8276
|
// ctx->device->device.resetFences({ ctx->fence });
|
|
8277
|
+
// ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8115
8278
|
//
|
|
8116
8279
|
// auto end = std::chrono::high_resolution_clock::now();
|
|
8117
8280
|
//
|
|
@@ -8251,9 +8414,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8251
8414
|
// y[i] = i % k;
|
|
8252
8415
|
}
|
|
8253
8416
|
|
|
8254
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8417
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
|
8255
8418
|
if (split_k > 1) {
|
|
8256
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8419
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
|
8257
8420
|
|
|
8258
8421
|
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
|
8259
8422
|
// Resize buffer
|
|
@@ -8264,19 +8427,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8264
8427
|
}
|
|
8265
8428
|
}
|
|
8266
8429
|
if (mmq) {
|
|
8267
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8430
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
|
|
8268
8431
|
}
|
|
8269
8432
|
|
|
8270
8433
|
if (ctx->device->need_compiles) {
|
|
8271
8434
|
ggml_vk_load_shaders(ctx->device);
|
|
8272
8435
|
}
|
|
8273
8436
|
|
|
8274
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8437
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8275
8438
|
|
|
8276
8439
|
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
|
8277
8440
|
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
|
8278
8441
|
|
|
8279
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8442
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8280
8443
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8281
8444
|
if (mmq) {
|
|
8282
8445
|
for (size_t i = 0; i < num_it; i++) {
|
|
@@ -8305,6 +8468,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8305
8468
|
ggml_vk_submit(subctx, ctx->fence);
|
|
8306
8469
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
|
8307
8470
|
ctx->device->device.resetFences({ ctx->fence });
|
|
8471
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8308
8472
|
|
|
8309
8473
|
auto end = std::chrono::high_resolution_clock::now();
|
|
8310
8474
|
|
|
@@ -8600,6 +8764,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8600
8764
|
case GGML_OP_COUNT_EQUAL:
|
|
8601
8765
|
case GGML_OP_IM2COL:
|
|
8602
8766
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8767
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8603
8768
|
case GGML_OP_POOL_2D:
|
|
8604
8769
|
case GGML_OP_CONV_2D_DW:
|
|
8605
8770
|
case GGML_OP_RWKV_WKV6:
|
|
@@ -8618,7 +8783,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8618
8783
|
|
|
8619
8784
|
if (!dryrun) {
|
|
8620
8785
|
if (ctx->compute_ctx.expired()) {
|
|
8621
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
8786
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8622
8787
|
ctx->compute_ctx = compute_ctx;
|
|
8623
8788
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
8624
8789
|
} else {
|
|
@@ -8664,6 +8829,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8664
8829
|
case GGML_OP_COUNT_EQUAL:
|
|
8665
8830
|
case GGML_OP_IM2COL:
|
|
8666
8831
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8832
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8667
8833
|
case GGML_OP_POOL_2D:
|
|
8668
8834
|
case GGML_OP_CONV_2D_DW:
|
|
8669
8835
|
case GGML_OP_LEAKY_RELU:
|
|
@@ -8671,7 +8837,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8671
8837
|
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
|
8672
8838
|
// do the only thing needed for the dryrun.
|
|
8673
8839
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
|
8674
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8840
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
8675
8841
|
return false;
|
|
8676
8842
|
}
|
|
8677
8843
|
default:
|
|
@@ -8835,6 +9001,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8835
9001
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8836
9002
|
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
|
8837
9003
|
|
|
9004
|
+
break;
|
|
9005
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
9006
|
+
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
|
|
9007
|
+
|
|
8838
9008
|
break;
|
|
8839
9009
|
case GGML_OP_POOL_2D:
|
|
8840
9010
|
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
|
@@ -8963,6 +9133,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
|
|
8963
9133
|
case GGML_OP_COUNT_EQUAL:
|
|
8964
9134
|
case GGML_OP_IM2COL:
|
|
8965
9135
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
9136
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8966
9137
|
case GGML_OP_POOL_2D:
|
|
8967
9138
|
case GGML_OP_CONV_2D_DW:
|
|
8968
9139
|
case GGML_OP_RWKV_WKV6:
|
|
@@ -9058,19 +9229,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9058
9229
|
}
|
|
9059
9230
|
ctx->gc.temp_buffers.clear();
|
|
9060
9231
|
|
|
9061
|
-
|
|
9062
|
-
|
|
9063
|
-
|
|
9064
|
-
if (plr.expired()) {
|
|
9065
|
-
continue;
|
|
9066
|
-
}
|
|
9067
|
-
|
|
9068
|
-
vk_pipeline pl = plr.lock();
|
|
9069
|
-
ggml_pipeline_cleanup(pl);
|
|
9070
|
-
}
|
|
9071
|
-
|
|
9072
|
-
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
|
9073
|
-
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
|
9232
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
|
9233
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
|
9074
9234
|
|
|
9075
9235
|
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
|
|
9076
9236
|
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
|
|
@@ -9091,7 +9251,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9091
9251
|
|
|
9092
9252
|
ctx->tensor_ctxs.clear();
|
|
9093
9253
|
ctx->gc.contexts.clear();
|
|
9094
|
-
ctx->
|
|
9254
|
+
ctx->pipeline_descriptor_set_requirements = 0;
|
|
9255
|
+
ctx->descriptor_set_idx = 0;
|
|
9095
9256
|
}
|
|
9096
9257
|
|
|
9097
9258
|
// Clean up on backend free
|
|
@@ -9118,6 +9279,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9118
9279
|
|
|
9119
9280
|
ctx->device->device.destroyFence(ctx->fence);
|
|
9120
9281
|
ctx->device->device.destroyFence(ctx->almost_ready_fence);
|
|
9282
|
+
|
|
9283
|
+
for (auto& pool : ctx->descriptor_pools) {
|
|
9284
|
+
ctx->device->device.destroyDescriptorPool(pool);
|
|
9285
|
+
}
|
|
9286
|
+
ctx->descriptor_pools.clear();
|
|
9287
|
+
ctx->descriptor_sets.clear();
|
|
9288
|
+
|
|
9289
|
+
ctx->compute_cmd_pool.destroy(ctx->device->device);
|
|
9290
|
+
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
|
9121
9291
|
}
|
|
9122
9292
|
|
|
9123
9293
|
static int ggml_vk_get_device_count() {
|
|
@@ -9325,6 +9495,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer
|
|
|
9325
9495
|
UNUSED(buft);
|
|
9326
9496
|
}
|
|
9327
9497
|
|
|
9498
|
+
static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
9499
|
+
return vk_instance.devices[0]->suballocation_block_size;
|
|
9500
|
+
|
|
9501
|
+
UNUSED(buft);
|
|
9502
|
+
}
|
|
9503
|
+
|
|
9328
9504
|
// Should be changed to return device-specific host buffer type
|
|
9329
9505
|
// but that probably requires changes in llama.cpp
|
|
9330
9506
|
ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
@@ -9333,7 +9509,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
|
9333
9509
|
/* .get_name = */ ggml_backend_vk_host_buffer_type_name,
|
|
9334
9510
|
/* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
|
|
9335
9511
|
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
|
9336
|
-
/* .get_max_size = */
|
|
9512
|
+
/* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size,
|
|
9337
9513
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
9338
9514
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
9339
9515
|
},
|
|
@@ -9384,7 +9560,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
|
|
9384
9560
|
|
|
9385
9561
|
if (ctx->transfer_ctx.expired()) {
|
|
9386
9562
|
// Initialize new transfer context
|
|
9387
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9563
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9388
9564
|
ctx->transfer_ctx = transfer_ctx;
|
|
9389
9565
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9390
9566
|
} else {
|
|
@@ -9407,7 +9583,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
|
|
9407
9583
|
|
|
9408
9584
|
if (ctx->transfer_ctx.expired()) {
|
|
9409
9585
|
// Initialize new transfer context
|
|
9410
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9586
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9411
9587
|
ctx->transfer_ctx = transfer_ctx;
|
|
9412
9588
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9413
9589
|
} else {
|
|
@@ -9430,7 +9606,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
|
|
9430
9606
|
|
|
9431
9607
|
if (ctx->transfer_ctx.expired()) {
|
|
9432
9608
|
// Initialize new transfer context
|
|
9433
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9609
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9434
9610
|
ctx->transfer_ctx = transfer_ctx;
|
|
9435
9611
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9436
9612
|
} else {
|
|
@@ -9491,7 +9667,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9491
9667
|
ggml_vk_load_shaders(ctx->device);
|
|
9492
9668
|
}
|
|
9493
9669
|
ggml_vk_preallocate_buffers(ctx);
|
|
9494
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
9670
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
9495
9671
|
|
|
9496
9672
|
int last_node = cgraph->n_nodes - 1;
|
|
9497
9673
|
|
|
@@ -9513,8 +9689,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9513
9689
|
if (ctx->device->query_pool) {
|
|
9514
9690
|
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
|
|
9515
9691
|
}
|
|
9516
|
-
|
|
9517
|
-
query_create_info.queryType =
|
|
9692
|
+
vk::QueryPoolCreateInfo query_create_info;
|
|
9693
|
+
query_create_info.queryType = vk::QueryType::eTimestamp;
|
|
9518
9694
|
query_create_info.queryCount = cgraph->n_nodes + 100;
|
|
9519
9695
|
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
|
9520
9696
|
ctx->device->num_queries = query_create_info.queryCount;
|
|
@@ -9523,7 +9699,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9523
9699
|
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
|
9524
9700
|
|
|
9525
9701
|
GGML_ASSERT(ctx->compute_ctx.expired());
|
|
9526
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9702
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
9527
9703
|
ctx->compute_ctx = compute_ctx;
|
|
9528
9704
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
9529
9705
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
|
@@ -9558,7 +9734,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9558
9734
|
|
|
9559
9735
|
if (vk_perf_logger_enabled) {
|
|
9560
9736
|
if (ctx->compute_ctx.expired()) {
|
|
9561
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9737
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
9562
9738
|
ctx->compute_ctx = compute_ctx;
|
|
9563
9739
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
9564
9740
|
} else {
|
|
@@ -9600,7 +9776,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9600
9776
|
|
|
9601
9777
|
// Get the results and pass them to the logger
|
|
9602
9778
|
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
|
9603
|
-
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
|
9779
|
+
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
|
9604
9780
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
9605
9781
|
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
|
|
9606
9782
|
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
|
|
@@ -10024,6 +10200,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
|
10024
10200
|
case GGML_OP_LEAKY_RELU:
|
|
10025
10201
|
case GGML_OP_OPT_STEP_ADAMW:
|
|
10026
10202
|
return true;
|
|
10203
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
10204
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
|
10027
10205
|
default:
|
|
10028
10206
|
return false;
|
|
10029
10207
|
}
|
|
@@ -10170,8 +10348,9 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
|
|
|
10170
10348
|
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
|
10171
10349
|
switch (props.vendorID) {
|
|
10172
10350
|
case VK_VENDOR_ID_INTEL:
|
|
10173
|
-
//
|
|
10174
|
-
|
|
10351
|
+
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
|
|
10352
|
+
// while some older hardware (ex. Arc A770) has performance regressions
|
|
10353
|
+
return arch == vk_device_architecture::INTEL_XE2;
|
|
10175
10354
|
case VK_VENDOR_ID_AMD:
|
|
10176
10355
|
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
|
10177
10356
|
// Workaround for AMD proprietary driver reporting support on all GPUs
|
|
@@ -10515,6 +10694,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|
|
10515
10694
|
const int32_t dim = tensor->op_params[0];
|
|
10516
10695
|
const int32_t max_period = tensor->op_params[1];
|
|
10517
10696
|
tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
|
|
10697
|
+
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
|
|
10698
|
+
const int32_t s0 = tensor->op_params[0];
|
|
10699
|
+
const int32_t p0 = tensor->op_params[1];
|
|
10700
|
+
const int32_t d0 = tensor->op_params[2];
|
|
10701
|
+
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
|
10518
10702
|
} else if (tensor->op == GGML_OP_POOL_2D) {
|
|
10519
10703
|
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
|
10520
10704
|
const int32_t k0 = tensor->op_params[1];
|