@novastera-oss/llamarn 0.2.6 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/include/llama.h +141 -38
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/LlamaCppModel.cpp +58 -24
- package/cpp/LlamaCppModel.h +3 -3
- package/cpp/PureCppImpl.cpp +1 -1
- package/cpp/PureCppImpl.h +2 -2
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +15 -4
- package/cpp/llama.cpp/Makefile +2 -2
- package/cpp/llama.cpp/README.md +32 -13
- package/cpp/llama.cpp/common/CMakeLists.txt +10 -20
- package/cpp/llama.cpp/common/arg.cpp +37 -6
- package/cpp/llama.cpp/common/build-info.cpp.in +2 -2
- package/cpp/llama.cpp/common/chat-parser.cpp +5 -0
- package/cpp/llama.cpp/common/chat-parser.h +2 -0
- package/cpp/llama.cpp/common/chat.cpp +12 -9
- package/cpp/llama.cpp/common/chat.h +1 -1
- package/cpp/llama.cpp/common/common.cpp +53 -40
- package/cpp/llama.cpp/common/common.h +6 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +3 -46
- package/cpp/llama.cpp/common/speculative.cpp +6 -4
- package/cpp/llama.cpp/convert_hf_to_gguf.py +215 -76
- package/cpp/llama.cpp/ggml/CMakeLists.txt +48 -2
- package/cpp/llama.cpp/ggml/cmake/common.cmake +1 -2
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +2 -0
- package/cpp/llama.cpp/ggml/include/ggml.h +33 -0
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +64 -13
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +6 -1
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +33 -9
- package/cpp/llama.cpp/ggml/src/ggml-common.h +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +124 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +4114 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/repack.cpp +2163 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +2639 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/cpu-feats.cpp +82 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/powerpc/quants.c +2732 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +2069 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/repack.cpp +397 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +1300 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/wasm/quants.c +1481 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/quants.c +4311 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +184 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +4 -3
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +16 -7
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +93 -104
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +12 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.cpp → hbm.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.cpp +58 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/llamafile/sgemm.h +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +194 -69
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/quants.c +1158 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +1571 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.h +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +213 -37
- package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +45 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +59 -37
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cu +161 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-dw.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cu +91 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d-transpose.cuh +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-mma-f16.cuh +4 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +90 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cu +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mean.cuh +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cu +257 -87
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmv.cuh +2 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ssm-scan.cu +6 -4
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cu +5 -18
- package/cpp/llama.cpp/ggml/src/ggml-cuda/sumrows.cuh +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +61 -183
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +11 -10
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +260 -49
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +497 -282
- package/cpp/llama.cpp/ggml/src/ggml-musa/mudnn.cuh +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1078 -468
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/concat.cl +109 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl +283 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +30 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/repeat.cl +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tanh.cl +63 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/upscale.cl +121 -0
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +0 -2
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +18 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +5 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +20 -48
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +28 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +4 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +117 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +192 -53
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dequantize.hpp +32 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +49 -67
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +31 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +99 -159
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +8 -105
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +209 -92
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +60 -80
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +158 -203
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +55 -74
- package/cpp/llama.cpp/ggml/src/ggml-sycl/quants.hpp +38 -10
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +24 -20
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.cpp +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/sycl_hw.hpp +3 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +3 -8
- package/cpp/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +108 -16
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +12 -16
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +36 -28
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +487 -247
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/CMakeLists.txt +4 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +98 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +69 -19
- package/cpp/llama.cpp/ggml/src/gguf.cpp +5 -1
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +133 -0
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +25 -1
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +78 -3
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +97 -4
- package/cpp/llama.cpp/gguf-py/pyproject.toml +2 -2
- package/cpp/llama.cpp/include/llama.h +141 -38
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +124 -0
- package/cpp/llama.cpp/requirements/requirements-compare-llama-bench.txt +1 -0
- package/cpp/llama.cpp/src/CMakeLists.txt +2 -2
- package/cpp/llama.cpp/src/llama-arch.cpp +150 -3
- package/cpp/llama.cpp/src/llama-arch.h +25 -1
- package/cpp/llama.cpp/src/llama-batch.cpp +736 -274
- package/cpp/llama.cpp/src/llama-batch.h +110 -57
- package/cpp/llama.cpp/src/llama-chat.cpp +30 -8
- package/cpp/llama.cpp/src/llama-chat.h +1 -0
- package/cpp/llama.cpp/src/llama-context.cpp +360 -266
- package/cpp/llama.cpp/src/llama-context.h +27 -23
- package/cpp/llama.cpp/src/llama-cparams.cpp +1 -1
- package/cpp/llama.cpp/src/llama-cparams.h +1 -1
- package/cpp/llama.cpp/src/llama-graph.cpp +411 -344
- package/cpp/llama.cpp/src/llama-graph.h +126 -58
- package/cpp/llama.cpp/src/llama-hparams.cpp +10 -2
- package/cpp/llama.cpp/src/llama-hparams.h +16 -2
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.cpp +103 -73
- package/cpp/llama.cpp/src/llama-kv-cache-unified-iswa.h +34 -42
- package/cpp/llama.cpp/src/llama-kv-cache-unified.cpp +345 -221
- package/cpp/llama.cpp/src/llama-kv-cache-unified.h +75 -50
- package/cpp/llama.cpp/src/llama-kv-cells.h +51 -22
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +246 -0
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +138 -0
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.cpp → llama-memory-recurrent.cpp} +302 -317
- package/cpp/llama.cpp/src/{llama-kv-cache-recurrent.h → llama-memory-recurrent.h} +60 -68
- package/cpp/llama.cpp/src/llama-memory.cpp +41 -0
- package/cpp/llama.cpp/src/llama-memory.h +73 -36
- package/cpp/llama.cpp/src/llama-mmap.cpp +1 -1
- package/cpp/llama.cpp/src/llama-model-loader.cpp +42 -17
- package/cpp/llama.cpp/src/llama-model-saver.cpp +1 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1630 -511
- package/cpp/llama.cpp/src/llama-model.h +26 -0
- package/cpp/llama.cpp/src/llama-quant.cpp +89 -6
- package/cpp/llama.cpp/src/llama-vocab.cpp +58 -26
- package/cpp/llama.cpp/src/llama-vocab.h +1 -0
- package/cpp/llama.cpp/src/llama.cpp +11 -7
- package/cpp/llama.cpp/src/unicode.cpp +5 -0
- package/cpp/rn-completion.cpp +2 -2
- package/cpp/{rn-llama.hpp → rn-llama.h} +1 -1
- package/cpp/{rn-utils.hpp → rn-utils.h} +3 -0
- package/ios/include/chat.h +1 -1
- package/ios/include/common.h +6 -2
- package/ios/include/llama.h +141 -38
- package/ios/libs/llama.xcframework/Info.plist +15 -15
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3766 -3624
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4890 -4689
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4861 -4710
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3764 -3622
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4926 -4725
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +4897 -4746
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +3794 -3652
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +2 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +33 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +141 -38
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +1 -2
- package/cpp/llama.cpp/common/cmake/build-info-gen-cpp.cmake +0 -24
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +0 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +0 -13891
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +0 -1
- package/cpp/llama.cpp/src/llama-kv-cache.h +0 -44
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{cpu-feats-x86.cpp → arch/x86/cpu-feats.cpp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-hbm.h → hbm.h} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
|
@@ -78,7 +78,7 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
|
|
78
78
|
#define VK_VENDOR_ID_INTEL 0x8086
|
|
79
79
|
#define VK_VENDOR_ID_NVIDIA 0x10de
|
|
80
80
|
|
|
81
|
-
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE
|
|
81
|
+
#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 256
|
|
82
82
|
|
|
83
83
|
#define GGML_VK_MAX_NODES 8192
|
|
84
84
|
|
|
@@ -102,25 +102,11 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; }
|
|
|
102
102
|
|
|
103
103
|
struct ggml_backend_vk_context;
|
|
104
104
|
|
|
105
|
-
|
|
106
|
-
uint32_t queue_family_index;
|
|
107
|
-
vk::Queue queue;
|
|
108
|
-
vk::CommandPool pool;
|
|
109
|
-
uint32_t cmd_buffer_idx;
|
|
110
|
-
std::vector<vk::CommandBuffer> cmd_buffers;
|
|
111
|
-
|
|
112
|
-
vk::PipelineStageFlags stage_flags;
|
|
113
|
-
|
|
114
|
-
bool transfer_only;
|
|
115
|
-
};
|
|
105
|
+
#define MAX_PARAMETER_COUNT 8
|
|
116
106
|
|
|
117
107
|
struct vk_pipeline_struct {
|
|
118
108
|
std::string name;
|
|
119
109
|
vk::ShaderModule shader_module;
|
|
120
|
-
vk::DescriptorSetLayout dsl;
|
|
121
|
-
std::vector<vk::DescriptorPool> descriptor_pools;
|
|
122
|
-
std::vector<vk::DescriptorSet> descriptor_sets;
|
|
123
|
-
uint32_t descriptor_set_idx;
|
|
124
110
|
vk::PipelineLayout layout;
|
|
125
111
|
vk::Pipeline pipeline;
|
|
126
112
|
uint32_t push_constant_size;
|
|
@@ -167,6 +153,45 @@ struct ggml_backend_vk_buffer_type_context {
|
|
|
167
153
|
vk_device device;
|
|
168
154
|
};
|
|
169
155
|
|
|
156
|
+
struct vk_queue;
|
|
157
|
+
|
|
158
|
+
// Stores command pool/buffers. There's an instance of this
|
|
159
|
+
// for each (context,queue) pair and for each (device,queue) pair.
|
|
160
|
+
struct vk_command_pool {
|
|
161
|
+
void init(vk_device& device, vk_queue *q_);
|
|
162
|
+
void destroy(vk::Device& device);
|
|
163
|
+
|
|
164
|
+
vk::CommandPool pool;
|
|
165
|
+
uint32_t cmd_buffer_idx;
|
|
166
|
+
std::vector<vk::CommandBuffer> cmd_buffers;
|
|
167
|
+
|
|
168
|
+
vk_queue *q;
|
|
169
|
+
};
|
|
170
|
+
|
|
171
|
+
// Prevent simultaneous submissions to the same queue.
|
|
172
|
+
// This could be per vk_queue if we stopped having two vk_queue structures
|
|
173
|
+
// sharing the same vk::Queue.
|
|
174
|
+
static std::mutex queue_mutex;
|
|
175
|
+
|
|
176
|
+
struct vk_queue {
|
|
177
|
+
uint32_t queue_family_index;
|
|
178
|
+
vk::Queue queue;
|
|
179
|
+
|
|
180
|
+
vk_command_pool cmd_pool;
|
|
181
|
+
|
|
182
|
+
vk::PipelineStageFlags stage_flags;
|
|
183
|
+
|
|
184
|
+
bool transfer_only;
|
|
185
|
+
|
|
186
|
+
// copy everything except the cmd_pool
|
|
187
|
+
void copyFrom(vk_queue &other) {
|
|
188
|
+
queue_family_index = other.queue_family_index;
|
|
189
|
+
queue = other.queue;
|
|
190
|
+
stage_flags = other.stage_flags;
|
|
191
|
+
transfer_only = other.transfer_only;
|
|
192
|
+
}
|
|
193
|
+
};
|
|
194
|
+
|
|
170
195
|
static const char * ggml_backend_vk_buffer_type_name(ggml_backend_buffer_type_t buft);
|
|
171
196
|
static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
|
172
197
|
static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft);
|
|
@@ -196,6 +221,7 @@ enum vk_device_architecture {
|
|
|
196
221
|
AMD_RDNA1,
|
|
197
222
|
AMD_RDNA2,
|
|
198
223
|
AMD_RDNA3,
|
|
224
|
+
INTEL_XE2,
|
|
199
225
|
};
|
|
200
226
|
|
|
201
227
|
static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
|
|
@@ -246,6 +272,34 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice&
|
|
|
246
272
|
}
|
|
247
273
|
return vk_device_architecture::AMD_RDNA2;
|
|
248
274
|
}
|
|
275
|
+
} else if (props.vendorID == VK_VENDOR_ID_INTEL) {
|
|
276
|
+
const std::vector<vk::ExtensionProperties> ext_props = device.enumerateDeviceExtensionProperties();
|
|
277
|
+
|
|
278
|
+
bool subgroup_size_control = false;
|
|
279
|
+
|
|
280
|
+
for (const auto& properties : ext_props) {
|
|
281
|
+
if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
|
|
282
|
+
subgroup_size_control = true;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
if (!subgroup_size_control) {
|
|
287
|
+
return vk_device_architecture::OTHER;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
vk::PhysicalDeviceProperties2 props2;
|
|
291
|
+
vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;
|
|
292
|
+
|
|
293
|
+
props2.pNext = &subgroup_size_control_props;
|
|
294
|
+
device.getProperties2(&props2);
|
|
295
|
+
|
|
296
|
+
if (subgroup_size_control_props.minSubgroupSize == 16) {
|
|
297
|
+
// Xe2 architecture uses SIMD16 while previous Xe and Gen architecture uses SIMD8.
|
|
298
|
+
// Minimum subgroup size matches the SIMD width so we distinguish architecture by checking this value.
|
|
299
|
+
// https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html
|
|
300
|
+
// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
|
|
301
|
+
return vk_device_architecture::INTEL_XE2;
|
|
302
|
+
}
|
|
249
303
|
}
|
|
250
304
|
return vk_device_architecture::OTHER;
|
|
251
305
|
}
|
|
@@ -312,6 +366,8 @@ struct vk_device_struct {
|
|
|
312
366
|
// set to true to indicate that some shaders need to be compiled after the dryrun
|
|
313
367
|
bool need_compiles {};
|
|
314
368
|
|
|
369
|
+
vk::DescriptorSetLayout dsl;
|
|
370
|
+
|
|
315
371
|
vk_matmul_pipeline pipeline_matmul_f32 {};
|
|
316
372
|
vk_matmul_pipeline pipeline_matmul_f32_f16 {};
|
|
317
373
|
vk_matmul_pipeline pipeline_matmul_bf16 {};
|
|
@@ -396,6 +452,7 @@ struct vk_device_struct {
|
|
|
396
452
|
vk_pipeline pipeline_count_equal_i32;
|
|
397
453
|
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
|
398
454
|
vk_pipeline pipeline_timestep_embedding_f32;
|
|
455
|
+
vk_pipeline pipeline_conv_transpose_1d_f32;
|
|
399
456
|
vk_pipeline pipeline_pool2d_f32;
|
|
400
457
|
vk_pipeline pipeline_rwkv_wkv6_f32;
|
|
401
458
|
vk_pipeline pipeline_rwkv_wkv7_f32;
|
|
@@ -428,7 +485,6 @@ struct vk_device_struct {
|
|
|
428
485
|
vk_pipeline pipeline_flash_attn_split_k_reduce;
|
|
429
486
|
|
|
430
487
|
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
|
431
|
-
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
|
432
488
|
|
|
433
489
|
std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
|
|
434
490
|
|
|
@@ -444,7 +500,7 @@ struct vk_device_struct {
|
|
|
444
500
|
// for GGML_VK_PERF_LOGGER
|
|
445
501
|
std::unique_ptr<vk_perf_logger> perf_logger;
|
|
446
502
|
vk::QueryPool query_pool;
|
|
447
|
-
|
|
503
|
+
int32_t num_queries;
|
|
448
504
|
|
|
449
505
|
~vk_device_struct() {
|
|
450
506
|
VK_LOG_DEBUG("destroy device " << name);
|
|
@@ -453,10 +509,8 @@ struct vk_device_struct {
|
|
|
453
509
|
|
|
454
510
|
ggml_vk_destroy_buffer(sync_staging);
|
|
455
511
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
device.destroyCommandPool(transfer_queue.pool);
|
|
459
|
-
}
|
|
512
|
+
compute_queue.cmd_pool.destroy(device);
|
|
513
|
+
transfer_queue.cmd_pool.destroy(device);
|
|
460
514
|
|
|
461
515
|
for (auto& pipeline : pipelines) {
|
|
462
516
|
if (pipeline.second.expired()) {
|
|
@@ -468,10 +522,26 @@ struct vk_device_struct {
|
|
|
468
522
|
}
|
|
469
523
|
pipelines.clear();
|
|
470
524
|
|
|
525
|
+
device.destroyDescriptorSetLayout(dsl);
|
|
526
|
+
|
|
471
527
|
device.destroy();
|
|
472
528
|
}
|
|
473
529
|
};
|
|
474
530
|
|
|
531
|
+
void vk_command_pool::init(vk_device& device, vk_queue *q_) {
|
|
532
|
+
cmd_buffer_idx = 0;
|
|
533
|
+
q = q_;
|
|
534
|
+
|
|
535
|
+
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), q->queue_family_index);
|
|
536
|
+
pool = device->device.createCommandPool(command_pool_create_info);
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
void vk_command_pool::destroy(vk::Device& device) {
|
|
540
|
+
device.destroyCommandPool(pool);
|
|
541
|
+
pool = nullptr;
|
|
542
|
+
cmd_buffers.clear();
|
|
543
|
+
}
|
|
544
|
+
|
|
475
545
|
struct vk_buffer_struct {
|
|
476
546
|
vk::Buffer buffer = VK_NULL_HANDLE;
|
|
477
547
|
vk::DeviceMemory device_memory = VK_NULL_HANDLE;
|
|
@@ -706,6 +776,21 @@ struct vk_op_timestep_embedding_push_constants {
|
|
|
706
776
|
uint32_t max_period;
|
|
707
777
|
};
|
|
708
778
|
|
|
779
|
+
struct vk_op_conv_transpose_1d_push_constants {
|
|
780
|
+
uint32_t Cout;
|
|
781
|
+
uint32_t Cin;
|
|
782
|
+
uint32_t K;
|
|
783
|
+
uint32_t L;
|
|
784
|
+
uint32_t KL;
|
|
785
|
+
|
|
786
|
+
uint32_t nb01;
|
|
787
|
+
uint32_t nb02;
|
|
788
|
+
uint32_t nb11;
|
|
789
|
+
uint32_t nb1;
|
|
790
|
+
|
|
791
|
+
int32_t s0;
|
|
792
|
+
};
|
|
793
|
+
|
|
709
794
|
struct vk_op_pool2d_push_constants {
|
|
710
795
|
uint32_t IW; uint32_t IH;
|
|
711
796
|
uint32_t OW; uint32_t OH;
|
|
@@ -774,7 +859,7 @@ struct vk_context_struct {
|
|
|
774
859
|
std::vector<vk_staging_memcpy> in_memcpys;
|
|
775
860
|
std::vector<vk_staging_memcpy> out_memcpys;
|
|
776
861
|
|
|
777
|
-
|
|
862
|
+
vk_command_pool * p {};
|
|
778
863
|
};
|
|
779
864
|
typedef std::shared_ptr<vk_context_struct> vk_context;
|
|
780
865
|
typedef std::weak_ptr<vk_context_struct> vk_context_ref;
|
|
@@ -885,6 +970,14 @@ struct ggml_backend_vk_context {
|
|
|
885
970
|
vk_context_ref transfer_ctx;
|
|
886
971
|
|
|
887
972
|
std::vector<vk_context_ref> tensor_ctxs;
|
|
973
|
+
|
|
974
|
+
std::vector<vk::DescriptorPool> descriptor_pools;
|
|
975
|
+
std::vector<vk::DescriptorSet> descriptor_sets;
|
|
976
|
+
uint32_t descriptor_set_idx {};
|
|
977
|
+
uint32_t pipeline_descriptor_set_requirements {};
|
|
978
|
+
|
|
979
|
+
vk_command_pool compute_cmd_pool;
|
|
980
|
+
vk_command_pool transfer_cmd_pool;
|
|
888
981
|
};
|
|
889
982
|
|
|
890
983
|
static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT
|
|
@@ -948,6 +1041,14 @@ void vk_memory_logger::log_deallocation(vk_buffer_ref buf_ref) {
|
|
|
948
1041
|
struct vk_instance_t {
|
|
949
1042
|
vk::Instance instance;
|
|
950
1043
|
|
|
1044
|
+
bool debug_utils_support = false; // VK_EXT_debug_utils enabled
|
|
1045
|
+
PFN_vkSetDebugUtilsObjectNameEXT pfn_vkSetDebugUtilsObjectNameEXT = {};
|
|
1046
|
+
PFN_vkQueueBeginDebugUtilsLabelEXT pfn_vkQueueBeginDebugUtilsLabelEXT = {};
|
|
1047
|
+
PFN_vkQueueEndDebugUtilsLabelEXT pfn_vkQueueEndDebugUtilsLabelEXT = {};
|
|
1048
|
+
PFN_vkCmdBeginDebugUtilsLabelEXT pfn_vkCmdBeginDebugUtilsLabelEXT = {};
|
|
1049
|
+
PFN_vkCmdEndDebugUtilsLabelEXT pfn_vkCmdEndDebugUtilsLabelEXT = {};
|
|
1050
|
+
PFN_vkCmdInsertDebugUtilsLabelEXT pfn_vkCmdInsertDebugUtilsLabelEXT = {};
|
|
1051
|
+
|
|
951
1052
|
std::vector<size_t> device_indices;
|
|
952
1053
|
vk_device devices[GGML_VK_MAX_DEVICES];
|
|
953
1054
|
};
|
|
@@ -1015,39 +1116,19 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|
|
1015
1116
|
", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " <<
|
|
1016
1117
|
disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
|
|
1017
1118
|
GGML_ASSERT(parameter_count > 0);
|
|
1119
|
+
GGML_ASSERT(parameter_count <= MAX_PARAMETER_COUNT);
|
|
1018
1120
|
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
|
1019
1121
|
|
|
1020
1122
|
vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
|
|
1021
1123
|
pipeline->shader_module = device->device.createShaderModule(shader_module_create_info);
|
|
1022
1124
|
|
|
1023
|
-
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
|
1024
|
-
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
|
1025
|
-
for (uint32_t i = 0; i < parameter_count; i++) {
|
|
1026
|
-
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
|
1027
|
-
dsl_binding_flags.push_back({});
|
|
1028
|
-
}
|
|
1029
|
-
|
|
1030
|
-
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
|
1031
|
-
|
|
1032
1125
|
vk::PushConstantRange pcr(
|
|
1033
1126
|
vk::ShaderStageFlagBits::eCompute,
|
|
1034
1127
|
0,
|
|
1035
1128
|
pipeline->push_constant_size
|
|
1036
1129
|
);
|
|
1037
1130
|
|
|
1038
|
-
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
|
|
1039
|
-
{},
|
|
1040
|
-
dsl_binding);
|
|
1041
|
-
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
|
1042
|
-
pipeline->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
|
1043
|
-
|
|
1044
|
-
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
|
1045
|
-
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
|
1046
|
-
pipeline->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
|
1047
|
-
|
|
1048
|
-
pipeline->descriptor_set_idx = 0;
|
|
1049
|
-
|
|
1050
|
-
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
|
|
1131
|
+
vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), device->dsl, pcr);
|
|
1051
1132
|
pipeline->layout = device->device.createPipelineLayout(pipeline_layout_create_info);
|
|
1052
1133
|
|
|
1053
1134
|
std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
|
|
@@ -1107,6 +1188,14 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|
|
1107
1188
|
}
|
|
1108
1189
|
pipeline->compiled = true;
|
|
1109
1190
|
|
|
1191
|
+
if (vk_instance.debug_utils_support) {
|
|
1192
|
+
vk::DebugUtilsObjectNameInfoEXT duoni;
|
|
1193
|
+
duoni.objectType = vk::ObjectType::ePipeline;
|
|
1194
|
+
duoni.pObjectName = pipeline->name.c_str();
|
|
1195
|
+
duoni.objectHandle = reinterpret_cast<uint64_t>(static_cast<VkPipeline_T*>(pipeline->pipeline));
|
|
1196
|
+
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT(device->device, &static_cast<VkDebugUtilsObjectNameInfoEXT &>(duoni));
|
|
1197
|
+
}
|
|
1198
|
+
|
|
1110
1199
|
{
|
|
1111
1200
|
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1112
1201
|
device->pipelines.insert({ pipeline->name, pipeline });
|
|
@@ -1122,15 +1211,6 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin
|
|
|
1122
1211
|
|
|
1123
1212
|
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
|
1124
1213
|
VK_LOG_DEBUG("ggml_pipeline_destroy_pipeline(" << pipeline->name << ")");
|
|
1125
|
-
for (auto& pool : pipeline->descriptor_pools) {
|
|
1126
|
-
device.destroyDescriptorPool(pool);
|
|
1127
|
-
}
|
|
1128
|
-
pipeline->descriptor_pools.clear();
|
|
1129
|
-
pipeline->descriptor_sets.clear();
|
|
1130
|
-
pipeline->descriptor_set_idx = 0;
|
|
1131
|
-
|
|
1132
|
-
device.destroyDescriptorSetLayout(pipeline->dsl);
|
|
1133
|
-
|
|
1134
1214
|
device.destroyPipelineLayout(pipeline->layout);
|
|
1135
1215
|
|
|
1136
1216
|
device.destroyShaderModule(pipeline->shader_module);
|
|
@@ -1138,97 +1218,77 @@ static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline)
|
|
|
1138
1218
|
device.destroyPipeline(pipeline->pipeline);
|
|
1139
1219
|
}
|
|
1140
1220
|
|
|
1141
|
-
static void ggml_pipeline_request_descriptor_sets(
|
|
1221
|
+
static void ggml_pipeline_request_descriptor_sets(ggml_backend_vk_context *ctx, vk_pipeline& pipeline, uint32_t n) {
|
|
1142
1222
|
VK_LOG_DEBUG("ggml_pipeline_request_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
|
1143
|
-
|
|
1223
|
+
ctx->pipeline_descriptor_set_requirements += n;
|
|
1144
1224
|
if (!pipeline->compiled) {
|
|
1145
1225
|
pipeline->needed = true;
|
|
1146
|
-
device->need_compiles = true;
|
|
1226
|
+
ctx->device->need_compiles = true;
|
|
1147
1227
|
}
|
|
1148
1228
|
}
|
|
1149
1229
|
|
|
1150
|
-
static void ggml_pipeline_allocate_descriptor_sets(
|
|
1151
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1152
|
-
|
|
1153
|
-
for (auto& pair : device->pipeline_descriptor_set_requirements) {
|
|
1154
|
-
vk_pipeline pipeline = device->pipelines.at(pair.first).lock();
|
|
1155
|
-
const uint64_t n = pair.second;
|
|
1156
|
-
|
|
1157
|
-
VK_LOG_DEBUG("ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")");
|
|
1230
|
+
static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx) {
|
|
1158
1231
|
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1232
|
+
if (ctx->descriptor_sets.size() >= ctx->pipeline_descriptor_set_requirements) {
|
|
1233
|
+
// Enough descriptors are available
|
|
1234
|
+
return;
|
|
1235
|
+
}
|
|
1163
1236
|
|
|
1164
|
-
|
|
1165
|
-
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - pipeline->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1166
|
-
uint32_t pool_idx = pipeline->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1237
|
+
vk_device& device = ctx->device;
|
|
1167
1238
|
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1239
|
+
uint32_t to_alloc = ctx->pipeline_descriptor_set_requirements - ctx->descriptor_sets.size();
|
|
1240
|
+
uint32_t pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE - ctx->descriptor_sets.size() % VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1241
|
+
uint32_t pool_idx = ctx->descriptor_sets.size() / VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1172
1242
|
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
}
|
|
1243
|
+
while (to_alloc > 0) {
|
|
1244
|
+
const uint32_t alloc_count = std::min(pool_remaining, to_alloc);
|
|
1245
|
+
to_alloc -= alloc_count;
|
|
1246
|
+
pool_remaining = VK_DEVICE_DESCRIPTOR_POOL_SIZE;
|
|
1178
1247
|
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
|
1185
|
-
pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
|
|
1248
|
+
if (pool_idx >= ctx->descriptor_pools.size()) {
|
|
1249
|
+
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, MAX_PARAMETER_COUNT * VK_DEVICE_DESCRIPTOR_POOL_SIZE);
|
|
1250
|
+
vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, VK_DEVICE_DESCRIPTOR_POOL_SIZE, descriptor_pool_size);
|
|
1251
|
+
ctx->descriptor_pools.push_back(device->device.createDescriptorPool(descriptor_pool_create_info));
|
|
1252
|
+
}
|
|
1186
1253
|
|
|
1187
|
-
|
|
1254
|
+
std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
|
|
1255
|
+
for (uint32_t i = 0; i < alloc_count; i++) {
|
|
1256
|
+
layouts[i] = device->dsl;
|
|
1188
1257
|
}
|
|
1189
|
-
|
|
1190
|
-
|
|
1258
|
+
vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(ctx->descriptor_pools[pool_idx], alloc_count, layouts.data());
|
|
1259
|
+
std::vector<vk::DescriptorSet> sets = device->device.allocateDescriptorSets(descriptor_set_alloc_info);
|
|
1260
|
+
ctx->descriptor_sets.insert(ctx->descriptor_sets.end(), sets.begin(), sets.end());
|
|
1191
1261
|
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
pipeline->descriptor_set_idx = 0;
|
|
1262
|
+
pool_idx++;
|
|
1263
|
+
}
|
|
1195
1264
|
}
|
|
1196
1265
|
|
|
1197
|
-
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device,
|
|
1266
|
+
static vk::CommandBuffer ggml_vk_create_cmd_buffer(vk_device& device, vk_command_pool& p) {
|
|
1198
1267
|
VK_LOG_DEBUG("ggml_vk_create_cmd_buffer()");
|
|
1199
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1200
1268
|
|
|
1201
|
-
if (
|
|
1269
|
+
if (p.cmd_buffers.size() > p.cmd_buffer_idx) {
|
|
1202
1270
|
// Reuse command buffer
|
|
1203
|
-
return
|
|
1271
|
+
return p.cmd_buffers[p.cmd_buffer_idx++];
|
|
1204
1272
|
}
|
|
1205
1273
|
|
|
1206
1274
|
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
|
1207
|
-
|
|
1275
|
+
p.pool,
|
|
1208
1276
|
vk::CommandBufferLevel::ePrimary,
|
|
1209
1277
|
1);
|
|
1210
1278
|
const std::vector<vk::CommandBuffer> cmd_buffers = device->device.allocateCommandBuffers(command_buffer_alloc_info);
|
|
1211
1279
|
auto buf = cmd_buffers.front();
|
|
1212
1280
|
|
|
1213
|
-
|
|
1214
|
-
|
|
1281
|
+
p.cmd_buffers.push_back(buf);
|
|
1282
|
+
p.cmd_buffer_idx++;
|
|
1215
1283
|
|
|
1216
1284
|
return buf;
|
|
1217
1285
|
}
|
|
1218
1286
|
|
|
1219
|
-
static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, std::vector<vk_semaphore> wait_semaphores, std::vector<vk_semaphore> signal_semaphores) {
|
|
1220
|
-
VK_LOG_DEBUG("ggml_vk_create_submission()");
|
|
1221
|
-
vk_submission s;
|
|
1222
|
-
s.buffer = ggml_vk_create_cmd_buffer(device, q);
|
|
1223
|
-
s.wait_semaphores = std::move(wait_semaphores);
|
|
1224
|
-
s.signal_semaphores = std::move(signal_semaphores);
|
|
1225
|
-
return s;
|
|
1226
|
-
}
|
|
1227
|
-
|
|
1228
1287
|
static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
1229
1288
|
if (ctx->seqs.empty()) {
|
|
1230
1289
|
if (fence) {
|
|
1231
|
-
|
|
1290
|
+
std::lock_guard<std::mutex> guard(queue_mutex);
|
|
1291
|
+
ctx->p->q->queue.submit({}, fence);
|
|
1232
1292
|
}
|
|
1233
1293
|
return;
|
|
1234
1294
|
}
|
|
@@ -1267,7 +1327,7 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
|
1267
1327
|
tl_signal_vals.push_back({});
|
|
1268
1328
|
tl_signal_semaphores.push_back({});
|
|
1269
1329
|
for (size_t i = 0; i < submission.wait_semaphores.size(); i++) {
|
|
1270
|
-
stage_flags[idx].push_back(ctx->q->stage_flags);
|
|
1330
|
+
stage_flags[idx].push_back(ctx->p->q->stage_flags);
|
|
1271
1331
|
tl_wait_vals[idx].push_back(submission.wait_semaphores[i].value);
|
|
1272
1332
|
tl_wait_semaphores[idx].push_back(submission.wait_semaphores[i].s);
|
|
1273
1333
|
}
|
|
@@ -1297,7 +1357,8 @@ static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
|
|
|
1297
1357
|
}
|
|
1298
1358
|
}
|
|
1299
1359
|
|
|
1300
|
-
|
|
1360
|
+
std::lock_guard<std::mutex> guard(queue_mutex);
|
|
1361
|
+
ctx->p->q->queue.submit(submit_infos, fence);
|
|
1301
1362
|
|
|
1302
1363
|
ctx->seqs.clear();
|
|
1303
1364
|
}
|
|
@@ -1355,28 +1416,25 @@ static void ggml_vk_create_queue(vk_device& device, vk_queue& q, uint32_t queue_
|
|
|
1355
1416
|
q.queue_family_index = queue_family_index;
|
|
1356
1417
|
q.transfer_only = transfer_only;
|
|
1357
1418
|
|
|
1358
|
-
|
|
1359
|
-
q.pool = device->device.createCommandPool(command_pool_create_info_compute);
|
|
1360
|
-
|
|
1361
|
-
q.cmd_buffer_idx = 0;
|
|
1419
|
+
q.cmd_pool.init(device, &q);
|
|
1362
1420
|
|
|
1363
1421
|
q.queue = device->device.getQueue(queue_family_index, queue_index);
|
|
1364
1422
|
|
|
1365
1423
|
q.stage_flags = stage_flags;
|
|
1366
1424
|
}
|
|
1367
1425
|
|
|
1368
|
-
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx,
|
|
1426
|
+
static vk_context ggml_vk_create_context(ggml_backend_vk_context * ctx, vk_command_pool& p) {
|
|
1369
1427
|
vk_context result = std::make_shared<vk_context_struct>();
|
|
1370
1428
|
VK_LOG_DEBUG("ggml_vk_create_context(" << result << ")");
|
|
1371
1429
|
ctx->gc.contexts.emplace_back(result);
|
|
1372
|
-
result->
|
|
1430
|
+
result->p = &p;
|
|
1373
1431
|
return result;
|
|
1374
1432
|
}
|
|
1375
1433
|
|
|
1376
|
-
static vk_context ggml_vk_create_temporary_context(
|
|
1434
|
+
static vk_context ggml_vk_create_temporary_context(vk_command_pool& p) {
|
|
1377
1435
|
vk_context result = std::make_shared<vk_context_struct>();
|
|
1378
1436
|
VK_LOG_DEBUG("ggml_vk_create_temporary_context(" << result << ")");
|
|
1379
|
-
result->
|
|
1437
|
+
result->p = &p;
|
|
1380
1438
|
return result;
|
|
1381
1439
|
}
|
|
1382
1440
|
|
|
@@ -1409,15 +1467,29 @@ static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
|
|
|
1409
1467
|
return ctx->gc.events[ctx->event_idx++];
|
|
1410
1468
|
}
|
|
1411
1469
|
|
|
1412
|
-
static void
|
|
1413
|
-
VK_LOG_DEBUG("
|
|
1414
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1470
|
+
static void ggml_vk_command_pool_cleanup(vk_device& device, vk_command_pool& p) {
|
|
1471
|
+
VK_LOG_DEBUG("ggml_vk_command_pool_cleanup()");
|
|
1415
1472
|
|
|
1416
1473
|
// Requires command buffers to be done
|
|
1417
|
-
device->device.resetCommandPool(
|
|
1418
|
-
|
|
1474
|
+
device->device.resetCommandPool(p.pool);
|
|
1475
|
+
p.cmd_buffer_idx = 0;
|
|
1419
1476
|
}
|
|
1420
1477
|
|
|
1478
|
+
static void ggml_vk_queue_command_pools_cleanup(vk_device& device) {
|
|
1479
|
+
VK_LOG_DEBUG("ggml_vk_queue_command_pools_cleanup()");
|
|
1480
|
+
|
|
1481
|
+
// Arbitrary frequency to cleanup/reuse command buffers
|
|
1482
|
+
static constexpr uint32_t cleanup_frequency = 10;
|
|
1483
|
+
|
|
1484
|
+
if (device->compute_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
|
1485
|
+
ggml_vk_command_pool_cleanup(device, device->compute_queue.cmd_pool);
|
|
1486
|
+
}
|
|
1487
|
+
if (device->transfer_queue.cmd_pool.cmd_buffer_idx >= cleanup_frequency) {
|
|
1488
|
+
ggml_vk_command_pool_cleanup(device, device->transfer_queue.cmd_pool);
|
|
1489
|
+
}
|
|
1490
|
+
}
|
|
1491
|
+
|
|
1492
|
+
|
|
1421
1493
|
static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
|
|
1422
1494
|
for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
|
|
1423
1495
|
vk::MemoryType memory_type = mem_props->memoryTypes[i];
|
|
@@ -1436,8 +1508,6 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, vk::Memor
|
|
|
1436
1508
|
throw vk::OutOfDeviceMemoryError("Requested buffer size exceeds device memory allocation limit");
|
|
1437
1509
|
}
|
|
1438
1510
|
|
|
1439
|
-
std::lock_guard<std::mutex> guard(device->mutex);
|
|
1440
|
-
|
|
1441
1511
|
vk_buffer buf = std::make_shared<vk_buffer_struct>();
|
|
1442
1512
|
|
|
1443
1513
|
if (size == 0) {
|
|
@@ -1566,11 +1636,11 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
|
|
|
1566
1636
|
static void ggml_vk_sync_buffers(vk_context& ctx) {
|
|
1567
1637
|
VK_LOG_DEBUG("ggml_vk_sync_buffers()");
|
|
1568
1638
|
|
|
1569
|
-
const bool transfer_queue = ctx->q->transfer_only;
|
|
1639
|
+
const bool transfer_queue = ctx->p->q->transfer_only;
|
|
1570
1640
|
|
|
1571
1641
|
ctx->s->buffer.pipelineBarrier(
|
|
1572
|
-
ctx->q->stage_flags,
|
|
1573
|
-
ctx->q->stage_flags,
|
|
1642
|
+
ctx->p->q->stage_flags,
|
|
1643
|
+
ctx->p->q->stage_flags,
|
|
1574
1644
|
{},
|
|
1575
1645
|
{ {
|
|
1576
1646
|
{ !transfer_queue ? (vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) },
|
|
@@ -1589,8 +1659,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
|
|
|
1589
1659
|
|
|
1590
1660
|
ctx->s->buffer.waitEvents(
|
|
1591
1661
|
events,
|
|
1592
|
-
ctx->q->stage_flags,
|
|
1593
|
-
ctx->q->stage_flags,
|
|
1662
|
+
ctx->p->q->stage_flags,
|
|
1663
|
+
ctx->p->q->stage_flags,
|
|
1594
1664
|
{},
|
|
1595
1665
|
{},
|
|
1596
1666
|
{}
|
|
@@ -2726,6 +2796,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
|
2726
2796
|
|
|
2727
2797
|
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
|
2728
2798
|
|
|
2799
|
+
ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1);
|
|
2800
|
+
|
|
2729
2801
|
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
|
2730
2802
|
|
|
2731
2803
|
ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1);
|
|
@@ -3322,6 +3394,22 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
3322
3394
|
}
|
|
3323
3395
|
}
|
|
3324
3396
|
|
|
3397
|
+
|
|
3398
|
+
std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
|
|
3399
|
+
std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
|
|
3400
|
+
for (uint32_t i = 0; i < MAX_PARAMETER_COUNT; i++) {
|
|
3401
|
+
dsl_binding.push_back({i, vk::DescriptorType::eStorageBuffer, 1, vk::ShaderStageFlagBits::eCompute});
|
|
3402
|
+
dsl_binding_flags.push_back({});
|
|
3403
|
+
}
|
|
3404
|
+
|
|
3405
|
+
vk::DescriptorSetLayoutBindingFlagsCreateInfo dslbfci = { dsl_binding_flags };
|
|
3406
|
+
|
|
3407
|
+
vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
|
|
3408
|
+
{},
|
|
3409
|
+
dsl_binding);
|
|
3410
|
+
descriptor_set_layout_create_info.setPNext(&dslbfci);
|
|
3411
|
+
device->dsl = device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
|
|
3412
|
+
|
|
3325
3413
|
ggml_vk_load_shaders(device);
|
|
3326
3414
|
|
|
3327
3415
|
if (!device->single_queue) {
|
|
@@ -3329,7 +3417,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
3329
3417
|
ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true);
|
|
3330
3418
|
} else {
|
|
3331
3419
|
// TODO: Use pointer or reference to avoid copy
|
|
3332
|
-
device->transfer_queue
|
|
3420
|
+
device->transfer_queue.copyFrom(device->compute_queue);
|
|
3421
|
+
device->transfer_queue.cmd_pool.init(device, &device->transfer_queue);
|
|
3333
3422
|
}
|
|
3334
3423
|
|
|
3335
3424
|
device->buffer_type = {
|
|
@@ -3488,6 +3577,8 @@ static void ggml_vk_print_gpu_info(size_t idx) {
|
|
|
3488
3577
|
static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
|
3489
3578
|
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
|
|
3490
3579
|
|
|
3580
|
+
static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
|
|
3581
|
+
|
|
3491
3582
|
static void ggml_vk_instance_init() {
|
|
3492
3583
|
if (vk_instance_initialized) {
|
|
3493
3584
|
return;
|
|
@@ -3508,7 +3599,7 @@ static void ggml_vk_instance_init() {
|
|
|
3508
3599
|
#ifdef __APPLE__
|
|
3509
3600
|
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
|
|
3510
3601
|
#endif
|
|
3511
|
-
|
|
3602
|
+
const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr;
|
|
3512
3603
|
std::vector<const char*> layers;
|
|
3513
3604
|
|
|
3514
3605
|
if (validation_ext) {
|
|
@@ -3523,6 +3614,9 @@ static void ggml_vk_instance_init() {
|
|
|
3523
3614
|
extensions.push_back("VK_KHR_portability_enumeration");
|
|
3524
3615
|
}
|
|
3525
3616
|
#endif
|
|
3617
|
+
if (debug_utils_ext) {
|
|
3618
|
+
extensions.push_back("VK_EXT_debug_utils");
|
|
3619
|
+
}
|
|
3526
3620
|
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
|
|
3527
3621
|
#ifdef __APPLE__
|
|
3528
3622
|
if (portability_enumeration_ext) {
|
|
@@ -3546,13 +3640,25 @@ static void ggml_vk_instance_init() {
|
|
|
3546
3640
|
vk_instance.instance = vk::createInstance(instance_create_info);
|
|
3547
3641
|
vk_instance_initialized = true;
|
|
3548
3642
|
|
|
3549
|
-
|
|
3643
|
+
if (debug_utils_ext) {
|
|
3644
|
+
vk_instance.debug_utils_support = true;
|
|
3645
|
+
vk_instance.pfn_vkSetDebugUtilsObjectNameEXT = (PFN_vkSetDebugUtilsObjectNameEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkSetDebugUtilsObjectNameEXT");
|
|
3646
|
+
vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT = (PFN_vkQueueBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueBeginDebugUtilsLabelEXT");
|
|
3647
|
+
vk_instance.pfn_vkQueueEndDebugUtilsLabelEXT = (PFN_vkQueueEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkQueueEndDebugUtilsLabelEXT");
|
|
3648
|
+
vk_instance.pfn_vkCmdBeginDebugUtilsLabelEXT = (PFN_vkCmdBeginDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdBeginDebugUtilsLabelEXT");
|
|
3649
|
+
vk_instance.pfn_vkCmdEndDebugUtilsLabelEXT = (PFN_vkCmdEndDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdEndDebugUtilsLabelEXT");
|
|
3650
|
+
vk_instance.pfn_vkCmdInsertDebugUtilsLabelEXT = (PFN_vkCmdInsertDebugUtilsLabelEXT) vkGetInstanceProcAddr(vk_instance.instance, "vkCmdInsertDebugUtilsLabelEXT");
|
|
3651
|
+
|
|
3652
|
+
}
|
|
3550
3653
|
|
|
3551
3654
|
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
|
3655
|
+
vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;
|
|
3552
3656
|
|
|
3553
3657
|
// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
|
|
3554
3658
|
char * devices_env = getenv("GGML_VK_VISIBLE_DEVICES");
|
|
3555
3659
|
if (devices_env != nullptr) {
|
|
3660
|
+
size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();
|
|
3661
|
+
|
|
3556
3662
|
std::string devices(devices_env);
|
|
3557
3663
|
std::replace(devices.begin(), devices.end(), ',', ' ');
|
|
3558
3664
|
|
|
@@ -3568,9 +3674,9 @@ static void ggml_vk_instance_init() {
|
|
|
3568
3674
|
} else {
|
|
3569
3675
|
std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
|
|
3570
3676
|
|
|
3571
|
-
//
|
|
3677
|
+
// If no vulkan devices are found, return early
|
|
3572
3678
|
if (devices.empty()) {
|
|
3573
|
-
|
|
3679
|
+
GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
|
|
3574
3680
|
return;
|
|
3575
3681
|
}
|
|
3576
3682
|
|
|
@@ -3653,9 +3759,20 @@ static void ggml_vk_instance_init() {
|
|
|
3653
3759
|
}
|
|
3654
3760
|
}
|
|
3655
3761
|
|
|
3656
|
-
// If no dedicated GPUs found, fall back to
|
|
3762
|
+
// If no dedicated GPUs found, fall back to the first non-CPU device.
|
|
3763
|
+
// If only CPU devices are available, return without devices.
|
|
3657
3764
|
if (vk_instance.device_indices.empty()) {
|
|
3658
|
-
|
|
3765
|
+
for (size_t i = 0; i < devices.size(); i++) {
|
|
3766
|
+
if (devices[i].getProperties().deviceType != vk::PhysicalDeviceType::eCpu) {
|
|
3767
|
+
vk_instance.device_indices.push_back(i);
|
|
3768
|
+
break;
|
|
3769
|
+
}
|
|
3770
|
+
}
|
|
3771
|
+
}
|
|
3772
|
+
|
|
3773
|
+
if (vk_instance.device_indices.empty()) {
|
|
3774
|
+
GGML_LOG_INFO("ggml_vulkan: No devices found.\n");
|
|
3775
|
+
return;
|
|
3659
3776
|
}
|
|
3660
3777
|
}
|
|
3661
3778
|
GGML_LOG_DEBUG("ggml_vulkan: Found %zu Vulkan devices:\n", vk_instance.device_indices.size());
|
|
@@ -3684,6 +3801,9 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
|
|
|
3684
3801
|
ctx->fence = ctx->device->device.createFence({});
|
|
3685
3802
|
ctx->almost_ready_fence = ctx->device->device.createFence({});
|
|
3686
3803
|
|
|
3804
|
+
ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue);
|
|
3805
|
+
ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue);
|
|
3806
|
+
|
|
3687
3807
|
#ifdef GGML_VULKAN_CHECK_RESULTS
|
|
3688
3808
|
const char* skip_checks = getenv("GGML_VULKAN_SKIP_CHECKS");
|
|
3689
3809
|
vk_skip_checks = (skip_checks == NULL ? 0 : atoi(skip_checks));
|
|
@@ -4049,9 +4169,9 @@ static void ggml_vk_host_get(vk_device& device, const void * ptr, vk_buffer& buf
|
|
|
4049
4169
|
}
|
|
4050
4170
|
}
|
|
4051
4171
|
|
|
4052
|
-
static vk_submission ggml_vk_begin_submission(vk_device& device,
|
|
4172
|
+
static vk_submission ggml_vk_begin_submission(vk_device& device, vk_command_pool& p, bool one_time = true) {
|
|
4053
4173
|
vk_submission s;
|
|
4054
|
-
s.buffer = ggml_vk_create_cmd_buffer(device,
|
|
4174
|
+
s.buffer = ggml_vk_create_cmd_buffer(device, p);
|
|
4055
4175
|
if (one_time) {
|
|
4056
4176
|
s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
|
|
4057
4177
|
} else {
|
|
@@ -4061,7 +4181,33 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
|
|
|
4061
4181
|
return s;
|
|
4062
4182
|
}
|
|
4063
4183
|
|
|
4064
|
-
|
|
4184
|
+
template <typename T> size_t push_constant_size(const T &t) {
|
|
4185
|
+
static_assert(std::is_class<T>::value, "T must be a struct/class");
|
|
4186
|
+
GGML_UNUSED(t);
|
|
4187
|
+
return sizeof(T);
|
|
4188
|
+
}
|
|
4189
|
+
template <typename T> size_t push_constant_size(const std::vector<T> &t) {
|
|
4190
|
+
GGML_UNUSED(t);
|
|
4191
|
+
return sizeof(T) * t.size();
|
|
4192
|
+
}
|
|
4193
|
+
template <typename T, uint32_t N> size_t push_constant_size(const std::array<T, N> &t) {
|
|
4194
|
+
GGML_UNUSED(t);
|
|
4195
|
+
return sizeof(T) * N;
|
|
4196
|
+
}
|
|
4197
|
+
|
|
4198
|
+
template <typename T> const T *push_constant_data(const T &t) {
|
|
4199
|
+
static_assert(std::is_class<T>::value, "T must be a struct/class");
|
|
4200
|
+
return &t;
|
|
4201
|
+
}
|
|
4202
|
+
template <typename T> const T *push_constant_data(const std::vector<T> &t) {
|
|
4203
|
+
return t.data();
|
|
4204
|
+
}
|
|
4205
|
+
template <typename T, uint32_t N> const T *push_constant_data(const std::array<T, N> &t) {
|
|
4206
|
+
return t.data();
|
|
4207
|
+
}
|
|
4208
|
+
|
|
4209
|
+
template <typename T>
|
|
4210
|
+
static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, const T &push_constants, std::array<uint32_t, 3> elements) {
|
|
4065
4211
|
const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
|
|
4066
4212
|
const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
|
|
4067
4213
|
const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
|
|
@@ -4070,14 +4216,14 @@ static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context&
|
|
|
4070
4216
|
std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.range << "), ";
|
|
4071
4217
|
}
|
|
4072
4218
|
std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
|
|
4073
|
-
GGML_ASSERT(
|
|
4074
|
-
GGML_ASSERT(descriptor_buffer_infos.size()
|
|
4219
|
+
GGML_ASSERT(ctx->descriptor_set_idx < ctx->descriptor_sets.size());
|
|
4220
|
+
GGML_ASSERT(descriptor_buffer_infos.size() <= MAX_PARAMETER_COUNT);
|
|
4075
4221
|
|
|
4076
|
-
vk::DescriptorSet& descriptor_set =
|
|
4222
|
+
vk::DescriptorSet& descriptor_set = ctx->descriptor_sets[ctx->descriptor_set_idx++];
|
|
4077
4223
|
vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
|
|
4078
4224
|
ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
|
|
4079
4225
|
|
|
4080
|
-
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
|
|
4226
|
+
subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size(push_constants), push_constant_data(push_constants));
|
|
4081
4227
|
subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
|
|
4082
4228
|
subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
|
|
4083
4229
|
pipeline->layout,
|
|
@@ -4110,7 +4256,7 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) {
|
|
|
4110
4256
|
ggml_vk_ctx_end(subctx);
|
|
4111
4257
|
}
|
|
4112
4258
|
|
|
4113
|
-
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->
|
|
4259
|
+
subctx->seqs.push_back({ ggml_vk_begin_submission(device, *subctx->p) });
|
|
4114
4260
|
subctx->s = subctx->seqs[subctx->seqs.size() - 1].data();
|
|
4115
4261
|
}
|
|
4116
4262
|
|
|
@@ -4311,7 +4457,9 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
|
|
4311
4457
|
memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
|
|
4312
4458
|
}
|
|
4313
4459
|
} else {
|
|
4314
|
-
|
|
4460
|
+
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
|
4461
|
+
|
|
4462
|
+
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
|
4315
4463
|
ggml_vk_ctx_begin(dst->device, subctx);
|
|
4316
4464
|
ggml_vk_buffer_write_2d_async(subctx, dst, offset, src, spitch, width, height, true);
|
|
4317
4465
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4323,6 +4471,7 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void *
|
|
|
4323
4471
|
ggml_vk_submit(subctx, dst->device->fence);
|
|
4324
4472
|
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
|
4325
4473
|
dst->device->device.resetFences({ dst->device->fence });
|
|
4474
|
+
ggml_vk_queue_command_pools_cleanup(dst->device);
|
|
4326
4475
|
}
|
|
4327
4476
|
}
|
|
4328
4477
|
|
|
@@ -4399,7 +4548,9 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
|
|
4399
4548
|
|
|
4400
4549
|
memcpy(dst, (uint8_t *) src->ptr + offset, size);
|
|
4401
4550
|
} else {
|
|
4402
|
-
|
|
4551
|
+
std::lock_guard<std::mutex> guard(src->device->mutex);
|
|
4552
|
+
|
|
4553
|
+
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
|
4403
4554
|
ggml_vk_ctx_begin(src->device, subctx);
|
|
4404
4555
|
ggml_vk_buffer_read_async(subctx, src, offset, dst, size, true);
|
|
4405
4556
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4407,6 +4558,7 @@ static void ggml_vk_buffer_read(vk_buffer& src, size_t offset, void * dst, size_
|
|
|
4407
4558
|
ggml_vk_submit(subctx, src->device->fence);
|
|
4408
4559
|
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
|
|
4409
4560
|
src->device->device.resetFences({ src->device->fence });
|
|
4561
|
+
ggml_vk_queue_command_pools_cleanup(src->device);
|
|
4410
4562
|
|
|
4411
4563
|
for (auto& cpy : subctx->out_memcpys) {
|
|
4412
4564
|
memcpy(cpy.dst, cpy.src, cpy.n);
|
|
@@ -4426,15 +4578,17 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds
|
|
|
4426
4578
|
|
|
4427
4579
|
static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) {
|
|
4428
4580
|
if (src->device == dst->device) {
|
|
4581
|
+
std::lock_guard<std::mutex> guard(src->device->mutex);
|
|
4429
4582
|
VK_LOG_DEBUG("ggml_vk_buffer_copy(SINGLE_DEVICE, " << size << ")");
|
|
4430
4583
|
// Copy within the device
|
|
4431
|
-
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue);
|
|
4584
|
+
vk_context subctx = ggml_vk_create_temporary_context(src->device->transfer_queue.cmd_pool);
|
|
4432
4585
|
ggml_vk_ctx_begin(src->device, subctx);
|
|
4433
4586
|
ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
|
|
4434
4587
|
ggml_vk_ctx_end(subctx);
|
|
4435
4588
|
ggml_vk_submit(subctx, src->device->fence);
|
|
4436
4589
|
VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
|
|
4437
4590
|
src->device->device.resetFences({ src->device->fence });
|
|
4591
|
+
ggml_vk_queue_command_pools_cleanup(src->device);
|
|
4438
4592
|
} else {
|
|
4439
4593
|
VK_LOG_DEBUG("ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")");
|
|
4440
4594
|
// Copy device to device
|
|
@@ -4459,7 +4613,8 @@ static void ggml_vk_buffer_memset_async(vk_context& ctx, vk_buffer& dst, size_t
|
|
|
4459
4613
|
static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, size_t size) {
|
|
4460
4614
|
VK_LOG_DEBUG("ggml_vk_buffer_memset(" << offset << ", " << c << ", " << size << ")");
|
|
4461
4615
|
|
|
4462
|
-
|
|
4616
|
+
std::lock_guard<std::mutex> guard(dst->device->mutex);
|
|
4617
|
+
vk_context subctx = ggml_vk_create_temporary_context(dst->device->transfer_queue.cmd_pool);
|
|
4463
4618
|
ggml_vk_ctx_begin(dst->device, subctx);
|
|
4464
4619
|
subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
|
|
4465
4620
|
ggml_vk_ctx_end(subctx);
|
|
@@ -4467,6 +4622,7 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
|
|
|
4467
4622
|
ggml_vk_submit(subctx, dst->device->fence);
|
|
4468
4623
|
VK_CHECK(dst->device->device.waitForFences({ dst->device->fence }, true, UINT64_MAX), "vk_memset waitForFences");
|
|
4469
4624
|
dst->device->device.resetFences({ dst->device->fence });
|
|
4625
|
+
ggml_vk_queue_command_pools_cleanup(dst->device);
|
|
4470
4626
|
}
|
|
4471
4627
|
|
|
4472
4628
|
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
|
|
@@ -4540,7 +4696,7 @@ static void ggml_vk_matmul(
|
|
|
4540
4696
|
ggml_vk_sync_buffers(subctx);
|
|
4541
4697
|
if (split_k == 1) {
|
|
4542
4698
|
const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n };
|
|
4543
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d },
|
|
4699
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch });
|
|
4544
4700
|
return;
|
|
4545
4701
|
}
|
|
4546
4702
|
|
|
@@ -4548,10 +4704,10 @@ static void ggml_vk_matmul(
|
|
|
4548
4704
|
|
|
4549
4705
|
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
|
|
4550
4706
|
// Make sure enough workgroups get assigned for split k to work
|
|
4551
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer },
|
|
4707
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
|
|
4552
4708
|
ggml_vk_sync_buffers(subctx);
|
|
4553
4709
|
const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
|
|
4554
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2
|
|
4710
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 });
|
|
4555
4711
|
}
|
|
4556
4712
|
|
|
4557
4713
|
static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) {
|
|
@@ -4599,7 +4755,7 @@ static void ggml_vk_matmul_id(
|
|
|
4599
4755
|
ggml_vk_sync_buffers(subctx);
|
|
4600
4756
|
const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d,
|
|
4601
4757
|
nei0, nei1, nbi1, ne11, padded_n };
|
|
4602
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids },
|
|
4758
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as });
|
|
4603
4759
|
}
|
|
4604
4760
|
|
|
4605
4761
|
static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
|
|
@@ -4720,7 +4876,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context&
|
|
|
4720
4876
|
};
|
|
4721
4877
|
init_pushconst_fastdiv(pc);
|
|
4722
4878
|
ggml_vk_sync_buffers(subctx);
|
|
4723
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out },
|
|
4879
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements);
|
|
4724
4880
|
}
|
|
4725
4881
|
|
|
4726
4882
|
static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) {
|
|
@@ -4739,7 +4895,7 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4739
4895
|
vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1);
|
|
4740
4896
|
|
|
4741
4897
|
ggml_vk_sync_buffers(subctx);
|
|
4742
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out },
|
|
4898
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array<uint32_t, 1>{ne}, { ne, 1, 1 });
|
|
4743
4899
|
}
|
|
4744
4900
|
|
|
4745
4901
|
static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -4880,18 +5036,18 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4880
5036
|
}
|
|
4881
5037
|
|
|
4882
5038
|
// Request descriptor sets
|
|
4883
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5039
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
4884
5040
|
if (qx_needs_dequant) {
|
|
4885
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5041
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
4886
5042
|
}
|
|
4887
5043
|
if (qy_needs_dequant) {
|
|
4888
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5044
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
4889
5045
|
}
|
|
4890
5046
|
if (quantize_y) {
|
|
4891
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5047
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_q8_1, 1);
|
|
4892
5048
|
}
|
|
4893
5049
|
if (split_k > 1) {
|
|
4894
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5050
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
|
|
4895
5051
|
}
|
|
4896
5052
|
return;
|
|
4897
5053
|
}
|
|
@@ -4939,7 +5095,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
|
|
|
4939
5095
|
} else if (qx_needs_dequant) {
|
|
4940
5096
|
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
|
4941
5097
|
ggml_vk_sync_buffers(subctx);
|
|
4942
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc
|
|
5098
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
|
4943
5099
|
}
|
|
4944
5100
|
if (y_non_contig) {
|
|
4945
5101
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
@@ -5073,12 +5229,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5073
5229
|
|
|
5074
5230
|
// Request descriptor sets
|
|
5075
5231
|
if (qx_needs_dequant) {
|
|
5076
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5232
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5077
5233
|
}
|
|
5078
5234
|
if (qy_needs_dequant) {
|
|
5079
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5235
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5080
5236
|
}
|
|
5081
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5237
|
+
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
|
5082
5238
|
return;
|
|
5083
5239
|
}
|
|
5084
5240
|
|
|
@@ -5155,7 +5311,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5155
5311
|
ggml_vk_sync_buffers(subctx);
|
|
5156
5312
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
5157
5313
|
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
|
|
5158
|
-
|
|
5314
|
+
pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
|
|
5159
5315
|
}
|
|
5160
5316
|
|
|
5161
5317
|
static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5211,7 +5367,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
5211
5367
|
|
|
5212
5368
|
if (dryrun) {
|
|
5213
5369
|
// Request descriptor sets
|
|
5214
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5370
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], 1);
|
|
5215
5371
|
return;
|
|
5216
5372
|
}
|
|
5217
5373
|
|
|
@@ -5243,7 +5399,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
|
5243
5399
|
}
|
|
5244
5400
|
|
|
5245
5401
|
ggml_vk_sync_buffers(subctx);
|
|
5246
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } },
|
|
5402
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z });
|
|
5247
5403
|
}
|
|
5248
5404
|
|
|
5249
5405
|
static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5300,7 +5456,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
5300
5456
|
|
|
5301
5457
|
if (dryrun) {
|
|
5302
5458
|
// Request descriptor sets
|
|
5303
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5459
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
|
|
5304
5460
|
return;
|
|
5305
5461
|
}
|
|
5306
5462
|
|
|
@@ -5326,7 +5482,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
|
5326
5482
|
const std::array<uint32_t, 9> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
|
5327
5483
|
ggml_vk_sync_buffers(subctx);
|
|
5328
5484
|
ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
|
|
5329
|
-
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } },
|
|
5485
|
+
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
|
|
5330
5486
|
}
|
|
5331
5487
|
|
|
5332
5488
|
static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -5487,12 +5643,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5487
5643
|
}
|
|
5488
5644
|
|
|
5489
5645
|
// Request descriptor sets
|
|
5490
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5646
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
5491
5647
|
if (qx_needs_dequant) {
|
|
5492
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5648
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5493
5649
|
}
|
|
5494
5650
|
if (qy_needs_dequant) {
|
|
5495
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5651
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5496
5652
|
}
|
|
5497
5653
|
return;
|
|
5498
5654
|
}
|
|
@@ -5542,7 +5698,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
|
|
|
5542
5698
|
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
|
5543
5699
|
ggml_vk_sync_buffers(subctx);
|
|
5544
5700
|
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
|
|
5545
|
-
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc
|
|
5701
|
+
{ vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
|
5546
5702
|
}
|
|
5547
5703
|
if (y_non_contig) {
|
|
5548
5704
|
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
@@ -5681,12 +5837,12 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
5681
5837
|
|
|
5682
5838
|
// Request descriptor sets
|
|
5683
5839
|
if (qx_needs_dequant) {
|
|
5684
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5840
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_0, 1);
|
|
5685
5841
|
}
|
|
5686
5842
|
if (qy_needs_dequant) {
|
|
5687
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5843
|
+
ggml_pipeline_request_descriptor_sets(ctx, to_fp16_vk_1, 1);
|
|
5688
5844
|
}
|
|
5689
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
5845
|
+
ggml_pipeline_request_descriptor_sets(ctx, dmmv, 1);
|
|
5690
5846
|
return;
|
|
5691
5847
|
}
|
|
5692
5848
|
|
|
@@ -5762,7 +5918,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
|
|
|
5762
5918
|
ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
|
|
5763
5919
|
{ vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
|
|
5764
5920
|
vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
|
|
5765
|
-
|
|
5921
|
+
pc, { groups_x, (uint32_t)nei0, groups_z });
|
|
5766
5922
|
}
|
|
5767
5923
|
|
|
5768
5924
|
static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -6006,9 +6162,9 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6006
6162
|
|
|
6007
6163
|
if (dryrun) {
|
|
6008
6164
|
// Request descriptor sets
|
|
6009
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6165
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6010
6166
|
if (split_k > 1) {
|
|
6011
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6167
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_flash_attn_split_k_reduce, 1);
|
|
6012
6168
|
}
|
|
6013
6169
|
return;
|
|
6014
6170
|
}
|
|
@@ -6112,7 +6268,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6112
6268
|
// there's no more than one tile of rows (i.e. workgroups_x would have been
|
|
6113
6269
|
// one). We reuse workgroups_x to mean the number of splits, so we need to
|
|
6114
6270
|
// cancel out the divide by wg_denoms[0].
|
|
6115
|
-
|
|
6271
|
+
pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z });
|
|
6116
6272
|
|
|
6117
6273
|
ggml_vk_sync_buffers(subctx);
|
|
6118
6274
|
const std::array<uint32_t, 3> pc2 = { D, (uint32_t)ne1, split_k };
|
|
@@ -6121,7 +6277,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6121
6277
|
vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE},
|
|
6122
6278
|
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
|
6123
6279
|
},
|
|
6124
|
-
pc2
|
|
6280
|
+
pc2, { (uint32_t)ne1, 1, 1 });
|
|
6125
6281
|
} else {
|
|
6126
6282
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
|
|
6127
6283
|
{
|
|
@@ -6131,7 +6287,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6131
6287
|
vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE},
|
|
6132
6288
|
vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE},
|
|
6133
6289
|
},
|
|
6134
|
-
|
|
6290
|
+
pc, { workgroups_x, workgroups_y, workgroups_z });
|
|
6135
6291
|
}
|
|
6136
6292
|
}
|
|
6137
6293
|
|
|
@@ -6392,6 +6548,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
|
|
6392
6548
|
return ctx->device->pipeline_timestep_embedding_f32;
|
|
6393
6549
|
}
|
|
6394
6550
|
return nullptr;
|
|
6551
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
6552
|
+
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
6553
|
+
return ctx->device->pipeline_conv_transpose_1d_f32;
|
|
6554
|
+
}
|
|
6555
|
+
return nullptr;
|
|
6395
6556
|
case GGML_OP_POOL_2D:
|
|
6396
6557
|
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
|
6397
6558
|
return ctx->device->pipeline_pool2d_f32;
|
|
@@ -6566,7 +6727,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6566
6727
|
}
|
|
6567
6728
|
|
|
6568
6729
|
if (dryrun) {
|
|
6569
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
6730
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6570
6731
|
return;
|
|
6571
6732
|
}
|
|
6572
6733
|
|
|
@@ -6726,6 +6887,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6726
6887
|
uint32_t half_ceil = (dim + 1) / 2;
|
|
6727
6888
|
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
|
6728
6889
|
} break;
|
|
6890
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
6891
|
+
{
|
|
6892
|
+
elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1}
|
|
6893
|
+
} break;
|
|
6729
6894
|
case GGML_OP_POOL_2D:
|
|
6730
6895
|
{
|
|
6731
6896
|
const uint32_t N = dst->ne[3];
|
|
@@ -6800,7 +6965,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6800
6965
|
}
|
|
6801
6966
|
|
|
6802
6967
|
ggml_vk_sync_buffers(subctx);
|
|
6803
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6968
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6804
6969
|
} else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) {
|
|
6805
6970
|
// Empty src2 is possible in rope, but the shader needs a buffer
|
|
6806
6971
|
vk_subbuffer subbuf_z;
|
|
@@ -6811,26 +6976,26 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
|
|
6811
6976
|
}
|
|
6812
6977
|
|
|
6813
6978
|
ggml_vk_sync_buffers(subctx);
|
|
6814
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6979
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6815
6980
|
} else if (op == GGML_OP_IM2COL) {
|
|
6816
6981
|
// im2col uses only src1 and dst buffers
|
|
6817
6982
|
ggml_vk_sync_buffers(subctx);
|
|
6818
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6983
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6819
6984
|
} else if (op == GGML_OP_COUNT_EQUAL) {
|
|
6820
6985
|
ggml_vk_sync_buffers(subctx);
|
|
6821
6986
|
// count_equal assumes that destination buffer is initialized with zeroes
|
|
6822
6987
|
ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz);
|
|
6823
6988
|
ggml_vk_sync_buffers(subctx);
|
|
6824
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6989
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6825
6990
|
} else if (use_src2) {
|
|
6826
6991
|
ggml_vk_sync_buffers(subctx);
|
|
6827
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6992
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6828
6993
|
} else if (use_src1) {
|
|
6829
6994
|
ggml_vk_sync_buffers(subctx);
|
|
6830
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6995
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6831
6996
|
} else {
|
|
6832
6997
|
ggml_vk_sync_buffers(subctx);
|
|
6833
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } },
|
|
6998
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
|
|
6834
6999
|
}
|
|
6835
7000
|
}
|
|
6836
7001
|
|
|
@@ -6943,7 +7108,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6943
7108
|
GGML_ASSERT(pipeline != nullptr);
|
|
6944
7109
|
|
|
6945
7110
|
if (dryrun) {
|
|
6946
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7111
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
6947
7112
|
return;
|
|
6948
7113
|
}
|
|
6949
7114
|
|
|
@@ -6999,7 +7164,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
6999
7164
|
vk_subbuffer{ d_srcs[4], src_offsets[4], src_sizes[4] },
|
|
7000
7165
|
vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
|
|
7001
7166
|
vk_subbuffer{ d_D, dst_offset, dst_size }
|
|
7002
|
-
},
|
|
7167
|
+
}, pc, elements);
|
|
7003
7168
|
} else if (version == 7) {
|
|
7004
7169
|
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, {
|
|
7005
7170
|
vk_subbuffer{ d_srcs[0], src_offsets[0], src_sizes[0] },
|
|
@@ -7010,7 +7175,7 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
|
7010
7175
|
vk_subbuffer{ d_srcs[5], src_offsets[5], src_sizes[5] },
|
|
7011
7176
|
vk_subbuffer{ d_srcs[6], src_offsets[6], src_sizes[6] },
|
|
7012
7177
|
vk_subbuffer{ d_D, dst_offset, dst_size }
|
|
7013
|
-
},
|
|
7178
|
+
}, pc, elements);
|
|
7014
7179
|
} else {
|
|
7015
7180
|
// shouldn't happen
|
|
7016
7181
|
GGML_ASSERT(false);
|
|
@@ -7082,7 +7247,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
|
|
|
7082
7247
|
GGML_ASSERT(pipeline != nullptr);
|
|
7083
7248
|
|
|
7084
7249
|
if (dryrun) {
|
|
7085
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7250
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
7086
7251
|
return;
|
|
7087
7252
|
}
|
|
7088
7253
|
|
|
@@ -7147,7 +7312,7 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
|
|
|
7147
7312
|
vk_subbuffer{ d_GM, gm_offset, gm_size },
|
|
7148
7313
|
vk_subbuffer{ d_GV, gv_offset, gv_size },
|
|
7149
7314
|
vk_subbuffer{ d_P, p_offset, p_size },
|
|
7150
|
-
},
|
|
7315
|
+
}, pc, elements);
|
|
7151
7316
|
}
|
|
7152
7317
|
|
|
7153
7318
|
static void ggml_vk_opt_step_adamw(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) {
|
|
@@ -7529,6 +7694,37 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
|
|
7529
7694
|
}, dryrun);
|
|
7530
7695
|
}
|
|
7531
7696
|
|
|
7697
|
+
static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
|
|
7698
|
+
// src0: (K, Cout, Cin, 1) -- kernel
|
|
7699
|
+
// src1: (L, Cin, 1, 1) -- input
|
|
7700
|
+
// dst: (*, Cout, 1, 1)
|
|
7701
|
+
|
|
7702
|
+
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
7703
|
+
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
|
7704
|
+
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
|
7705
|
+
|
|
7706
|
+
GGML_TENSOR_BINARY_OP_LOCALS
|
|
7707
|
+
|
|
7708
|
+
GGML_ASSERT(nb00 == sizeof(float));
|
|
7709
|
+
GGML_ASSERT(nb10 == sizeof(float));
|
|
7710
|
+
|
|
7711
|
+
const int32_t s0 = dst->op_params[0];
|
|
7712
|
+
|
|
7713
|
+
vk_op_conv_transpose_1d_push_constants p{};
|
|
7714
|
+
p.Cout = static_cast<uint32_t>(ne01);
|
|
7715
|
+
p.Cin = static_cast<uint32_t>(ne02);
|
|
7716
|
+
p.K = static_cast<uint32_t>(ne00);
|
|
7717
|
+
p.L = static_cast<uint32_t>(ne10);
|
|
7718
|
+
p.KL = static_cast<uint32_t>(ne0);
|
|
7719
|
+
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
|
|
7720
|
+
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
|
|
7721
|
+
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
|
|
7722
|
+
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
|
|
7723
|
+
p.s0 = static_cast<uint32_t>(s0);
|
|
7724
|
+
|
|
7725
|
+
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p), dryrun);
|
|
7726
|
+
}
|
|
7727
|
+
|
|
7532
7728
|
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
|
7533
7729
|
uint32_t op = static_cast<uint32_t>(dst->op_params[0]);
|
|
7534
7730
|
const int32_t k1 = dst->op_params[1];
|
|
@@ -7729,9 +7925,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7729
7925
|
}
|
|
7730
7926
|
}
|
|
7731
7927
|
|
|
7732
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7928
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
|
7733
7929
|
if (split_k > 1) {
|
|
7734
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
7930
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
|
7735
7931
|
|
|
7736
7932
|
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
|
7737
7933
|
// Resize buffer
|
|
@@ -7746,7 +7942,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7746
7942
|
ggml_vk_load_shaders(ctx->device);
|
|
7747
7943
|
}
|
|
7748
7944
|
|
|
7749
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
7945
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
7750
7946
|
|
|
7751
7947
|
vk_buffer d_X = ggml_vk_create_buffer_check(ctx->device, sizeof(X_TYPE) * x_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
7752
7948
|
vk_buffer d_Y = ggml_vk_create_buffer_check(ctx->device, sizeof(Y_TYPE) * y_ne, vk::MemoryPropertyFlagBits::eDeviceLocal);
|
|
@@ -7788,7 +7984,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7788
7984
|
ggml_vk_buffer_write(d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
|
|
7789
7985
|
ggml_vk_buffer_write(d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
|
|
7790
7986
|
|
|
7791
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
7987
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
7792
7988
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
7793
7989
|
for (size_t i = 0; i < num_it; i++) {
|
|
7794
7990
|
ggml_vk_matmul(
|
|
@@ -7804,6 +8000,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7804
8000
|
ggml_vk_submit(subctx, ctx->fence);
|
|
7805
8001
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
|
|
7806
8002
|
ctx->device->device.resetFences({ ctx->fence });
|
|
8003
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
7807
8004
|
|
|
7808
8005
|
auto end = std::chrono::high_resolution_clock::now();
|
|
7809
8006
|
double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
|
|
@@ -7905,16 +8102,13 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
|
|
|
7905
8102
|
|
|
7906
8103
|
free(d_chk);
|
|
7907
8104
|
|
|
7908
|
-
|
|
7909
|
-
|
|
8105
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
|
8106
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
|
7910
8107
|
|
|
7911
8108
|
ggml_vk_destroy_buffer(d_X);
|
|
7912
8109
|
ggml_vk_destroy_buffer(d_Y);
|
|
7913
8110
|
ggml_vk_destroy_buffer(d_D);
|
|
7914
8111
|
|
|
7915
|
-
ggml_pipeline_cleanup(p);
|
|
7916
|
-
ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
|
|
7917
|
-
|
|
7918
8112
|
free(x);
|
|
7919
8113
|
free(y);
|
|
7920
8114
|
free(d);
|
|
@@ -7992,20 +8186,20 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
7992
8186
|
ggml_vk_quantize_data(x, qx, ne, quant);
|
|
7993
8187
|
ggml_vk_dequantize_data(qx, x_ref, ne, quant);
|
|
7994
8188
|
|
|
7995
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8189
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
|
7996
8190
|
|
|
7997
8191
|
if (ctx->device->need_compiles) {
|
|
7998
8192
|
ggml_vk_load_shaders(ctx->device);
|
|
7999
8193
|
}
|
|
8000
8194
|
|
|
8001
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8195
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8002
8196
|
|
|
8003
8197
|
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
|
8004
8198
|
|
|
8005
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8199
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8006
8200
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8007
8201
|
const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
|
|
8008
|
-
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc
|
|
8202
|
+
ggml_vk_dispatch_pipeline(ctx, subctx, p, { vk_subbuffer{ qx_buf, 0, qx_sz }, vk_subbuffer{ x_buf, 0, x_sz_f16 } }, pc, { (uint32_t)ne, 1, 1});
|
|
8009
8203
|
ggml_vk_ctx_end(subctx);
|
|
8010
8204
|
|
|
8011
8205
|
auto begin = std::chrono::high_resolution_clock::now();
|
|
@@ -8013,6 +8207,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8013
8207
|
ggml_vk_submit(subctx, ctx->fence);
|
|
8014
8208
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
|
8015
8209
|
ctx->device->device.resetFences({ ctx->fence });
|
|
8210
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8016
8211
|
|
|
8017
8212
|
auto end = std::chrono::high_resolution_clock::now();
|
|
8018
8213
|
|
|
@@ -8092,17 +8287,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8092
8287
|
//
|
|
8093
8288
|
// vk_pipeline p = ggml_vk_get_quantize_pipeline(ctx, quant);
|
|
8094
8289
|
//
|
|
8095
|
-
// ggml_pipeline_request_descriptor_sets(ctx
|
|
8290
|
+
// ggml_pipeline_request_descriptor_sets(ctx, p, 1);
|
|
8096
8291
|
//
|
|
8097
8292
|
// if (ctx->device->need_compiles) {
|
|
8098
8293
|
// ggml_vk_load_shaders(ctx->device);
|
|
8099
8294
|
// }
|
|
8100
8295
|
//
|
|
8101
|
-
// ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8296
|
+
// ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8102
8297
|
//
|
|
8103
8298
|
// ggml_vk_buffer_write(x_buf, 0, x, x_sz);
|
|
8104
8299
|
//
|
|
8105
|
-
// vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8300
|
+
// vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8106
8301
|
// ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8107
8302
|
// ggml_vk_quantize_q8_1(ctx, subctx, ggml_vk_subbuffer(x_buf), ggml_vk_subbuffer(qx_buf), ne);
|
|
8108
8303
|
// ggml_vk_ctx_end(subctx);
|
|
@@ -8112,6 +8307,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
|
|
|
8112
8307
|
// ggml_vk_submit(subctx, ctx->fence);
|
|
8113
8308
|
// VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_quantize waitForFences");
|
|
8114
8309
|
// ctx->device->device.resetFences({ ctx->fence });
|
|
8310
|
+
// ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8115
8311
|
//
|
|
8116
8312
|
// auto end = std::chrono::high_resolution_clock::now();
|
|
8117
8313
|
//
|
|
@@ -8251,9 +8447,9 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8251
8447
|
// y[i] = i % k;
|
|
8252
8448
|
}
|
|
8253
8449
|
|
|
8254
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8450
|
+
ggml_pipeline_request_descriptor_sets(ctx, p, num_it);
|
|
8255
8451
|
if (split_k > 1) {
|
|
8256
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8452
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
|
|
8257
8453
|
|
|
8258
8454
|
if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
|
|
8259
8455
|
// Resize buffer
|
|
@@ -8264,19 +8460,19 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8264
8460
|
}
|
|
8265
8461
|
}
|
|
8266
8462
|
if (mmq) {
|
|
8267
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8463
|
+
ggml_pipeline_request_descriptor_sets(ctx, ctx->device->pipeline_quantize_q8_1, num_it);
|
|
8268
8464
|
}
|
|
8269
8465
|
|
|
8270
8466
|
if (ctx->device->need_compiles) {
|
|
8271
8467
|
ggml_vk_load_shaders(ctx->device);
|
|
8272
8468
|
}
|
|
8273
8469
|
|
|
8274
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
8470
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
8275
8471
|
|
|
8276
8472
|
ggml_vk_buffer_write(qx_buf, 0, qx, qx_sz);
|
|
8277
8473
|
ggml_vk_buffer_write(y_buf, 0, y, y_sz);
|
|
8278
8474
|
|
|
8279
|
-
vk_context subctx = ggml_vk_create_context(ctx, ctx->
|
|
8475
|
+
vk_context subctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8280
8476
|
ggml_vk_ctx_begin(ctx->device, subctx);
|
|
8281
8477
|
if (mmq) {
|
|
8282
8478
|
for (size_t i = 0; i < num_it; i++) {
|
|
@@ -8305,6 +8501,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
|
8305
8501
|
ggml_vk_submit(subctx, ctx->fence);
|
|
8306
8502
|
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
|
|
8307
8503
|
ctx->device->device.resetFences({ ctx->fence });
|
|
8504
|
+
ggml_vk_queue_command_pools_cleanup(ctx->device);
|
|
8308
8505
|
|
|
8309
8506
|
auto end = std::chrono::high_resolution_clock::now();
|
|
8310
8507
|
|
|
@@ -8600,6 +8797,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8600
8797
|
case GGML_OP_COUNT_EQUAL:
|
|
8601
8798
|
case GGML_OP_IM2COL:
|
|
8602
8799
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8800
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8603
8801
|
case GGML_OP_POOL_2D:
|
|
8604
8802
|
case GGML_OP_CONV_2D_DW:
|
|
8605
8803
|
case GGML_OP_RWKV_WKV6:
|
|
@@ -8618,7 +8816,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8618
8816
|
|
|
8619
8817
|
if (!dryrun) {
|
|
8620
8818
|
if (ctx->compute_ctx.expired()) {
|
|
8621
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
8819
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
8622
8820
|
ctx->compute_ctx = compute_ctx;
|
|
8623
8821
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
8624
8822
|
} else {
|
|
@@ -8664,6 +8862,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8664
8862
|
case GGML_OP_COUNT_EQUAL:
|
|
8665
8863
|
case GGML_OP_IM2COL:
|
|
8666
8864
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8865
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8667
8866
|
case GGML_OP_POOL_2D:
|
|
8668
8867
|
case GGML_OP_CONV_2D_DW:
|
|
8669
8868
|
case GGML_OP_LEAKY_RELU:
|
|
@@ -8671,7 +8870,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8671
8870
|
// These operations all go through ggml_vk_op_f32, so short-circuit and
|
|
8672
8871
|
// do the only thing needed for the dryrun.
|
|
8673
8872
|
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
|
|
8674
|
-
ggml_pipeline_request_descriptor_sets(ctx
|
|
8873
|
+
ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
|
|
8675
8874
|
return false;
|
|
8676
8875
|
}
|
|
8677
8876
|
default:
|
|
@@ -8835,6 +9034,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
|
8835
9034
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
8836
9035
|
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
|
8837
9036
|
|
|
9037
|
+
break;
|
|
9038
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
9039
|
+
ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node, dryrun);
|
|
9040
|
+
|
|
8838
9041
|
break;
|
|
8839
9042
|
case GGML_OP_POOL_2D:
|
|
8840
9043
|
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
|
@@ -8963,6 +9166,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
|
|
8963
9166
|
case GGML_OP_COUNT_EQUAL:
|
|
8964
9167
|
case GGML_OP_IM2COL:
|
|
8965
9168
|
case GGML_OP_TIMESTEP_EMBEDDING:
|
|
9169
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
8966
9170
|
case GGML_OP_POOL_2D:
|
|
8967
9171
|
case GGML_OP_CONV_2D_DW:
|
|
8968
9172
|
case GGML_OP_RWKV_WKV6:
|
|
@@ -9058,19 +9262,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9058
9262
|
}
|
|
9059
9263
|
ctx->gc.temp_buffers.clear();
|
|
9060
9264
|
|
|
9061
|
-
|
|
9062
|
-
|
|
9063
|
-
|
|
9064
|
-
if (plr.expired()) {
|
|
9065
|
-
continue;
|
|
9066
|
-
}
|
|
9067
|
-
|
|
9068
|
-
vk_pipeline pl = plr.lock();
|
|
9069
|
-
ggml_pipeline_cleanup(pl);
|
|
9070
|
-
}
|
|
9071
|
-
|
|
9072
|
-
ggml_vk_queue_cleanup(ctx->device, ctx->device->compute_queue);
|
|
9073
|
-
ggml_vk_queue_cleanup(ctx->device, ctx->device->transfer_queue);
|
|
9265
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
|
|
9266
|
+
ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
|
|
9074
9267
|
|
|
9075
9268
|
for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
|
|
9076
9269
|
ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
|
|
@@ -9091,7 +9284,8 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9091
9284
|
|
|
9092
9285
|
ctx->tensor_ctxs.clear();
|
|
9093
9286
|
ctx->gc.contexts.clear();
|
|
9094
|
-
ctx->
|
|
9287
|
+
ctx->pipeline_descriptor_set_requirements = 0;
|
|
9288
|
+
ctx->descriptor_set_idx = 0;
|
|
9095
9289
|
}
|
|
9096
9290
|
|
|
9097
9291
|
// Clean up on backend free
|
|
@@ -9118,6 +9312,15 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
|
9118
9312
|
|
|
9119
9313
|
ctx->device->device.destroyFence(ctx->fence);
|
|
9120
9314
|
ctx->device->device.destroyFence(ctx->almost_ready_fence);
|
|
9315
|
+
|
|
9316
|
+
for (auto& pool : ctx->descriptor_pools) {
|
|
9317
|
+
ctx->device->device.destroyDescriptorPool(pool);
|
|
9318
|
+
}
|
|
9319
|
+
ctx->descriptor_pools.clear();
|
|
9320
|
+
ctx->descriptor_sets.clear();
|
|
9321
|
+
|
|
9322
|
+
ctx->compute_cmd_pool.destroy(ctx->device->device);
|
|
9323
|
+
ctx->transfer_cmd_pool.destroy(ctx->device->device);
|
|
9121
9324
|
}
|
|
9122
9325
|
|
|
9123
9326
|
static int ggml_vk_get_device_count() {
|
|
@@ -9325,6 +9528,12 @@ static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer
|
|
|
9325
9528
|
UNUSED(buft);
|
|
9326
9529
|
}
|
|
9327
9530
|
|
|
9531
|
+
static size_t ggml_backend_vk_host_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
|
9532
|
+
return vk_instance.devices[0]->suballocation_block_size;
|
|
9533
|
+
|
|
9534
|
+
UNUSED(buft);
|
|
9535
|
+
}
|
|
9536
|
+
|
|
9328
9537
|
// Should be changed to return device-specific host buffer type
|
|
9329
9538
|
// but that probably requires changes in llama.cpp
|
|
9330
9539
|
ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
@@ -9333,7 +9542,7 @@ ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type() {
|
|
|
9333
9542
|
/* .get_name = */ ggml_backend_vk_host_buffer_type_name,
|
|
9334
9543
|
/* .alloc_buffer = */ ggml_backend_vk_host_buffer_type_alloc_buffer,
|
|
9335
9544
|
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
|
|
9336
|
-
/* .get_max_size = */
|
|
9545
|
+
/* .get_max_size = */ ggml_backend_vk_host_buffer_type_get_max_size,
|
|
9337
9546
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
9338
9547
|
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
9339
9548
|
},
|
|
@@ -9384,7 +9593,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
|
|
|
9384
9593
|
|
|
9385
9594
|
if (ctx->transfer_ctx.expired()) {
|
|
9386
9595
|
// Initialize new transfer context
|
|
9387
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9596
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9388
9597
|
ctx->transfer_ctx = transfer_ctx;
|
|
9389
9598
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9390
9599
|
} else {
|
|
@@ -9407,7 +9616,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
|
|
|
9407
9616
|
|
|
9408
9617
|
if (ctx->transfer_ctx.expired()) {
|
|
9409
9618
|
// Initialize new transfer context
|
|
9410
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9619
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9411
9620
|
ctx->transfer_ctx = transfer_ctx;
|
|
9412
9621
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9413
9622
|
} else {
|
|
@@ -9430,7 +9639,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
|
|
|
9430
9639
|
|
|
9431
9640
|
if (ctx->transfer_ctx.expired()) {
|
|
9432
9641
|
// Initialize new transfer context
|
|
9433
|
-
transfer_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9642
|
+
transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
|
|
9434
9643
|
ctx->transfer_ctx = transfer_ctx;
|
|
9435
9644
|
ggml_vk_ctx_begin(ctx->device, transfer_ctx);
|
|
9436
9645
|
} else {
|
|
@@ -9480,6 +9689,13 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9480
9689
|
VK_LOG_DEBUG("ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)");
|
|
9481
9690
|
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
|
9482
9691
|
|
|
9692
|
+
if (vk_instance.debug_utils_support) {
|
|
9693
|
+
vk::DebugUtilsLabelEXT dul = {};
|
|
9694
|
+
dul.pLabelName = "ggml_backend_vk_graph_compute";
|
|
9695
|
+
dul.color = std::array<float,4>{1.0f, 1.0f, 1.0f, 1.0f};
|
|
9696
|
+
vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT*>(&dul));
|
|
9697
|
+
}
|
|
9698
|
+
|
|
9483
9699
|
uint64_t total_mat_mul_bytes = 0;
|
|
9484
9700
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
9485
9701
|
ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false, false);
|
|
@@ -9491,7 +9707,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9491
9707
|
ggml_vk_load_shaders(ctx->device);
|
|
9492
9708
|
}
|
|
9493
9709
|
ggml_vk_preallocate_buffers(ctx);
|
|
9494
|
-
ggml_pipeline_allocate_descriptor_sets(ctx
|
|
9710
|
+
ggml_pipeline_allocate_descriptor_sets(ctx);
|
|
9495
9711
|
|
|
9496
9712
|
int last_node = cgraph->n_nodes - 1;
|
|
9497
9713
|
|
|
@@ -9513,8 +9729,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9513
9729
|
if (ctx->device->query_pool) {
|
|
9514
9730
|
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
|
|
9515
9731
|
}
|
|
9516
|
-
|
|
9517
|
-
query_create_info.queryType =
|
|
9732
|
+
vk::QueryPoolCreateInfo query_create_info;
|
|
9733
|
+
query_create_info.queryType = vk::QueryType::eTimestamp;
|
|
9518
9734
|
query_create_info.queryCount = cgraph->n_nodes + 100;
|
|
9519
9735
|
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
|
|
9520
9736
|
ctx->device->num_queries = query_create_info.queryCount;
|
|
@@ -9523,7 +9739,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9523
9739
|
ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);
|
|
9524
9740
|
|
|
9525
9741
|
GGML_ASSERT(ctx->compute_ctx.expired());
|
|
9526
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9742
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
9527
9743
|
ctx->compute_ctx = compute_ctx;
|
|
9528
9744
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
9529
9745
|
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
|
|
@@ -9558,7 +9774,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9558
9774
|
|
|
9559
9775
|
if (vk_perf_logger_enabled) {
|
|
9560
9776
|
if (ctx->compute_ctx.expired()) {
|
|
9561
|
-
compute_ctx = ggml_vk_create_context(ctx, ctx->
|
|
9777
|
+
compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
|
|
9562
9778
|
ctx->compute_ctx = compute_ctx;
|
|
9563
9779
|
ggml_vk_ctx_begin(ctx->device, compute_ctx);
|
|
9564
9780
|
} else {
|
|
@@ -9600,7 +9816,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
|
|
|
9600
9816
|
|
|
9601
9817
|
// Get the results and pass them to the logger
|
|
9602
9818
|
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
|
|
9603
|
-
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
|
|
9819
|
+
VK_CHECK(ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait), "get timestamp results");
|
|
9604
9820
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
9605
9821
|
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
|
|
9606
9822
|
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
|
|
@@ -10024,6 +10240,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
|
|
10024
10240
|
case GGML_OP_LEAKY_RELU:
|
|
10025
10241
|
case GGML_OP_OPT_STEP_ADAMW:
|
|
10026
10242
|
return true;
|
|
10243
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
10244
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
|
10027
10245
|
default:
|
|
10028
10246
|
return false;
|
|
10029
10247
|
}
|
|
@@ -10167,11 +10385,28 @@ static bool ggml_vk_instance_portability_enumeration_ext_available(const std::ve
|
|
|
10167
10385
|
UNUSED(instance_extensions);
|
|
10168
10386
|
}
|
|
10169
10387
|
|
|
10388
|
+
// Extension availability
|
|
10389
|
+
static bool ggml_vk_instance_debug_utils_ext_available(
|
|
10390
|
+
const std::vector<vk::ExtensionProperties> & instance_extensions) {
|
|
10391
|
+
// Check for portability enumeration extension for MoltenVK support
|
|
10392
|
+
for (const auto & properties : instance_extensions) {
|
|
10393
|
+
if (strcmp("VK_EXT_debug_utils", properties.extensionName) == 0) {
|
|
10394
|
+
return true;
|
|
10395
|
+
}
|
|
10396
|
+
}
|
|
10397
|
+
|
|
10398
|
+
std::cerr << "ggml_vulkan: WARNING: Instance extension VK_EXT_debug_utils not found." << std::endl;
|
|
10399
|
+
return false;
|
|
10400
|
+
|
|
10401
|
+
UNUSED(instance_extensions);
|
|
10402
|
+
}
|
|
10403
|
+
|
|
10170
10404
|
static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) {
|
|
10171
10405
|
switch (props.vendorID) {
|
|
10172
10406
|
case VK_VENDOR_ID_INTEL:
|
|
10173
|
-
//
|
|
10174
|
-
|
|
10407
|
+
// Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost,
|
|
10408
|
+
// while some older hardware (ex. Arc A770) has performance regressions
|
|
10409
|
+
return arch == vk_device_architecture::INTEL_XE2;
|
|
10175
10410
|
case VK_VENDOR_ID_AMD:
|
|
10176
10411
|
if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) {
|
|
10177
10412
|
// Workaround for AMD proprietary driver reporting support on all GPUs
|
|
@@ -10515,6 +10750,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
|
|
10515
10750
|
const int32_t dim = tensor->op_params[0];
|
|
10516
10751
|
const int32_t max_period = tensor->op_params[1];
|
|
10517
10752
|
tensor_clone = ggml_timestep_embedding(ggml_ctx, src_clone[0], dim, max_period);
|
|
10753
|
+
} else if (tensor->op == GGML_OP_CONV_TRANSPOSE_1D){
|
|
10754
|
+
const int32_t s0 = tensor->op_params[0];
|
|
10755
|
+
const int32_t p0 = tensor->op_params[1];
|
|
10756
|
+
const int32_t d0 = tensor->op_params[2];
|
|
10757
|
+
tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0);
|
|
10518
10758
|
} else if (tensor->op == GGML_OP_POOL_2D) {
|
|
10519
10759
|
enum ggml_op_pool op = static_cast<ggml_op_pool>(tensor->op_params[0]);
|
|
10520
10760
|
const int32_t k0 = tensor->op_params[1];
|