@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -11,10 +11,13 @@
|
|
|
11
11
|
|
|
12
12
|
#include <webgpu/webgpu_cpp.h>
|
|
13
13
|
|
|
14
|
+
#include <atomic>
|
|
14
15
|
#include <condition_variable>
|
|
15
16
|
#include <cstring>
|
|
16
17
|
#include <iostream>
|
|
18
|
+
#include <map>
|
|
17
19
|
#include <mutex>
|
|
20
|
+
#include <optional>
|
|
18
21
|
#include <string>
|
|
19
22
|
#include <vector>
|
|
20
23
|
|
|
@@ -25,16 +28,76 @@
|
|
|
25
28
|
# define WEBGPU_LOG_DEBUG(msg) ((void) 0)
|
|
26
29
|
#endif // GGML_WEBGPU_DEBUG
|
|
27
30
|
|
|
31
|
+
#ifdef GGML_WEBGPU_CPU_PROFILE
|
|
32
|
+
// total timing (aggregated)
|
|
33
|
+
# define WEBGPU_CPU_PROFILE_TOTAL_START(id) auto cpu_total_start_##id = std::chrono::high_resolution_clock::now();
|
|
34
|
+
|
|
35
|
+
# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx) \
|
|
36
|
+
auto cpu_total_end_##id = std::chrono::high_resolution_clock::now(); \
|
|
37
|
+
double cpu_total_time_##id = \
|
|
38
|
+
std::chrono::duration<double, std::milli>(cpu_total_end_##id - cpu_total_start_##id).count(); \
|
|
39
|
+
(ctx)->cpu_time_ms[#id] += cpu_total_time_##id;
|
|
40
|
+
|
|
41
|
+
// fine-grained timing (not included in totals)
|
|
42
|
+
# define WEBGPU_CPU_PROFILE_DETAIL_START(id) auto cpu_detail_start_##id = std::chrono::high_resolution_clock::now();
|
|
43
|
+
|
|
44
|
+
# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx) \
|
|
45
|
+
auto cpu_detail_end_##id = std::chrono::high_resolution_clock::now(); \
|
|
46
|
+
double cpu_detail_time_##id = \
|
|
47
|
+
std::chrono::duration<double, std::milli>(cpu_detail_end_##id - cpu_detail_start_##id).count(); \
|
|
48
|
+
(ctx)->cpu_detail_ms[#id] += cpu_detail_time_##id;
|
|
49
|
+
#else
|
|
50
|
+
# define WEBGPU_CPU_PROFILE_TOTAL_START(id)
|
|
51
|
+
# define WEBGPU_CPU_PROFILE_TOTAL_END(id, ctx)
|
|
52
|
+
# define WEBGPU_CPU_PROFILE_DETAIL_START(id)
|
|
53
|
+
# define WEBGPU_CPU_PROFILE_DETAIL_END(id, ctx)
|
|
54
|
+
#endif // GGML_WEBGPU_CPU_PROFILE
|
|
55
|
+
|
|
56
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
57
|
+
# define WEBGPU_NUM_TIMESTAMP_QUERY_BUFS 24
|
|
58
|
+
# define WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES 16 // e.g. enough for two timestamps
|
|
59
|
+
#endif
|
|
60
|
+
|
|
28
61
|
/* Constants */
|
|
29
62
|
|
|
30
|
-
#define
|
|
31
|
-
#define
|
|
32
|
-
#define
|
|
63
|
+
#define WEBGPU_MUL_MAT_WG_SIZE 256
|
|
64
|
+
#define WEBGPU_NUM_PARAM_BUFS 32u
|
|
65
|
+
#define WEBGPU_COMMAND_SUBMIT_BATCH_SIZE 8u
|
|
66
|
+
#define WEBGPU_WAIT_ANY_TIMEOUT_MS 0
|
|
67
|
+
// Maximum number of in-flight submissions per-thread, to avoid exhausting the parameter buffer pool
|
|
68
|
+
#define WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD WEBGPU_NUM_PARAM_BUFS / WEBGPU_COMMAND_SUBMIT_BATCH_SIZE
|
|
33
69
|
#define WEBGPU_PARAMS_BUF_SIZE_BYTES 128 // enough for 32 parameters
|
|
34
70
|
#define WEBGPU_NUM_SET_ROWS_ERROR_BUFS 32
|
|
35
71
|
#define WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES 4
|
|
36
72
|
#define WEBGPU_STORAGE_BUF_BINDING_MULT 4 // a storage buffer binding size must be a multiple of 4
|
|
37
73
|
|
|
74
|
+
// For operations which process a row in parallel, this seems like a reasonable default
|
|
75
|
+
#define WEBGPU_ROW_SPLIT_WG_SIZE 64
|
|
76
|
+
|
|
77
|
+
// Matrix multiplication parameters
|
|
78
|
+
|
|
79
|
+
// Register tiling parameters
|
|
80
|
+
#define WEBGPU_MUL_MAT_TILE_M 8
|
|
81
|
+
#define WEBGPU_MUL_MAT_TILE_N 8
|
|
82
|
+
#define WEBGPU_MUL_MAT_WG_SIZE_M 8
|
|
83
|
+
#define WEBGPU_MUL_MAT_WG_SIZE_N 8
|
|
84
|
+
#define WEBGPU_MUL_MAT_TILE_K 32
|
|
85
|
+
|
|
86
|
+
// Subgroup matrix parameters
|
|
87
|
+
// The number of subgroups in the M dimension
|
|
88
|
+
#define WEBGPU_MUL_MAT_SUBGROUP_M 2
|
|
89
|
+
// The number of subgroups in the N dimension
|
|
90
|
+
#define WEBGPU_MUL_MAT_SUBGROUP_N 2
|
|
91
|
+
// The number of subgroup matrices each subgroup accumulates over
|
|
92
|
+
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M 4
|
|
93
|
+
#define WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N 2
|
|
94
|
+
|
|
95
|
+
// Matrix-vector multiplication parameters
|
|
96
|
+
#define WEBGPU_MUL_MAT_VEC_WG_SIZE 256
|
|
97
|
+
// Must be multiple of 4 to work with vectorized paths, and must divide mul_mat_vec wg size
|
|
98
|
+
#define WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG 64
|
|
99
|
+
#define WEBGPU_MUL_MAT_VEC_TILE_K 256
|
|
100
|
+
|
|
38
101
|
/* End Constants */
|
|
39
102
|
|
|
40
103
|
// This is a "fake" base pointer, since WebGPU buffers do not have pointers to their locations.
|
|
@@ -62,6 +125,11 @@ struct webgpu_pool_bufs {
|
|
|
62
125
|
wgpu::Buffer dev_buf;
|
|
63
126
|
};
|
|
64
127
|
|
|
128
|
+
// The futures to wait on for a single queue submission
|
|
129
|
+
struct webgpu_submission_futures {
|
|
130
|
+
std::vector<wgpu::FutureWaitInfo> futures;
|
|
131
|
+
};
|
|
132
|
+
|
|
65
133
|
// Holds a pool of parameter buffers for WebGPU operations
|
|
66
134
|
struct webgpu_buf_pool {
|
|
67
135
|
std::vector<webgpu_pool_bufs> free;
|
|
@@ -108,6 +176,83 @@ struct webgpu_buf_pool {
|
|
|
108
176
|
}
|
|
109
177
|
};
|
|
110
178
|
|
|
179
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
180
|
+
struct webgpu_gpu_profile_bufs {
|
|
181
|
+
wgpu::Buffer host_buf;
|
|
182
|
+
wgpu::Buffer dev_buf;
|
|
183
|
+
wgpu::QuerySet query_set;
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
// Holds a pool of parameter buffers for WebGPU operations
|
|
187
|
+
struct webgpu_gpu_profile_buf_pool {
|
|
188
|
+
std::vector<webgpu_gpu_profile_bufs> free;
|
|
189
|
+
|
|
190
|
+
std::mutex mutex;
|
|
191
|
+
|
|
192
|
+
std::condition_variable cv;
|
|
193
|
+
|
|
194
|
+
void init(wgpu::Device device,
|
|
195
|
+
int num_bufs,
|
|
196
|
+
size_t buf_size,
|
|
197
|
+
wgpu::BufferUsage dev_buf_usage,
|
|
198
|
+
wgpu::BufferUsage host_buf_usage) {
|
|
199
|
+
for (int i = 0; i < num_bufs; i++) {
|
|
200
|
+
wgpu::Buffer host_buf;
|
|
201
|
+
wgpu::Buffer dev_buf;
|
|
202
|
+
ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_profile_buf");
|
|
203
|
+
ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_profile_buf");
|
|
204
|
+
// Create a query set for 2 timestamps
|
|
205
|
+
wgpu::QuerySetDescriptor ts_query_set_desc = {};
|
|
206
|
+
|
|
207
|
+
ts_query_set_desc.type = wgpu::QueryType::Timestamp;
|
|
208
|
+
ts_query_set_desc.count = 2;
|
|
209
|
+
wgpu::QuerySet ts_query_set = device.CreateQuerySet(&ts_query_set_desc);
|
|
210
|
+
|
|
211
|
+
free.push_back({ host_buf, dev_buf, ts_query_set });
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
webgpu_gpu_profile_bufs alloc_bufs() {
|
|
216
|
+
std::unique_lock<std::mutex> lock(mutex);
|
|
217
|
+
cv.wait(lock, [this] { return !free.empty(); });
|
|
218
|
+
webgpu_gpu_profile_bufs bufs = free.back();
|
|
219
|
+
free.pop_back();
|
|
220
|
+
return bufs;
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
void free_bufs(std::vector<webgpu_gpu_profile_bufs> bufs) {
|
|
224
|
+
std::lock_guard<std::mutex> lock(mutex);
|
|
225
|
+
free.insert(free.end(), bufs.begin(), bufs.end());
|
|
226
|
+
cv.notify_all();
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
void cleanup() {
|
|
230
|
+
std::lock_guard<std::mutex> lock(mutex);
|
|
231
|
+
for (auto & bufs : free) {
|
|
232
|
+
bufs.host_buf.Destroy();
|
|
233
|
+
bufs.dev_buf.Destroy();
|
|
234
|
+
bufs.query_set.Destroy();
|
|
235
|
+
}
|
|
236
|
+
free.clear();
|
|
237
|
+
}
|
|
238
|
+
};
|
|
239
|
+
#endif
|
|
240
|
+
|
|
241
|
+
struct webgpu_pipeline {
|
|
242
|
+
wgpu::ComputePipeline pipeline;
|
|
243
|
+
std::string name;
|
|
244
|
+
};
|
|
245
|
+
|
|
246
|
+
struct webgpu_command {
|
|
247
|
+
wgpu::CommandBuffer commands;
|
|
248
|
+
webgpu_pool_bufs params_bufs;
|
|
249
|
+
std::optional<webgpu_pool_bufs> set_rows_error_bufs;
|
|
250
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
251
|
+
webgpu_gpu_profile_bufs timestamp_query_bufs;
|
|
252
|
+
std::string pipeline_name;
|
|
253
|
+
#endif
|
|
254
|
+
};
|
|
255
|
+
|
|
111
256
|
// All the base objects needed to run operations on a WebGPU device
|
|
112
257
|
struct webgpu_context_struct {
|
|
113
258
|
wgpu::Instance instance;
|
|
@@ -116,35 +261,64 @@ struct webgpu_context_struct {
|
|
|
116
261
|
wgpu::Queue queue;
|
|
117
262
|
wgpu::Limits limits;
|
|
118
263
|
|
|
264
|
+
bool supports_subgroup_matrix = false;
|
|
265
|
+
uint32_t subgroup_size;
|
|
266
|
+
wgpu::SubgroupMatrixConfig subgroup_matrix_config;
|
|
267
|
+
|
|
268
|
+
// Separate this out from limits since on some Metal systems, the limit returned by
|
|
269
|
+
// querying the limits is higher than the actual allowed maximum.
|
|
270
|
+
uint32_t max_wg_size_x;
|
|
271
|
+
|
|
119
272
|
std::recursive_mutex mutex;
|
|
273
|
+
std::atomic_uint inflight_threads = 0;
|
|
120
274
|
|
|
121
275
|
webgpu_buf_pool param_buf_pool;
|
|
122
276
|
webgpu_buf_pool set_rows_error_buf_pool;
|
|
123
277
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
278
|
+
webgpu_pipeline memset_pipeline;
|
|
279
|
+
|
|
280
|
+
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> mul_mat_pipelines; // src0_type, src1_type, vectorized
|
|
281
|
+
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>>
|
|
282
|
+
mul_mat_vec_pipelines; // src0_type, src1_type, vectorized
|
|
283
|
+
|
|
284
|
+
webgpu_pipeline mul_mat_pipeline[30][2];
|
|
285
|
+
webgpu_pipeline set_rows_pipeline[1][2]; // dst->type, vectorized
|
|
286
|
+
webgpu_pipeline get_rows_pipeline[30];
|
|
287
|
+
webgpu_pipeline get_rows_f32_no_vec_pipeline;
|
|
288
|
+
webgpu_pipeline cpy_pipeline[2][2]; // src type, dst type
|
|
289
|
+
webgpu_pipeline add_pipeline[2][2]; // type, inplace
|
|
290
|
+
webgpu_pipeline sub_pipeline[2][2]; // type, inplace
|
|
291
|
+
webgpu_pipeline mul_pipeline[2][2]; // type, inplace
|
|
292
|
+
webgpu_pipeline div_pipeline[2][2]; // type, inplace
|
|
293
|
+
webgpu_pipeline rms_norm_pipeline[2]; // inplace
|
|
294
|
+
webgpu_pipeline rope_pipeline[2][2][2]; // type, ff, inplace
|
|
295
|
+
webgpu_pipeline glu_pipeline[7][2][2]; // glu-op, type, split
|
|
296
|
+
webgpu_pipeline scale_pipeline[2]; // inplace
|
|
297
|
+
webgpu_pipeline soft_max_pipeline[3][2][2]; // (no_mask, f32_mask, f16_mask), has_sink, inplace
|
|
128
298
|
|
|
129
299
|
size_t memset_bytes_per_thread;
|
|
130
300
|
|
|
131
301
|
// Staging buffer for reading data from the GPU
|
|
132
302
|
wgpu::Buffer get_tensor_staging_buf;
|
|
133
303
|
|
|
134
|
-
// Command buffers which need to be submitted
|
|
135
|
-
std::vector<wgpu::CommandBuffer> staged_command_bufs;
|
|
136
|
-
|
|
137
|
-
// Parameter buffers associated with the staged command buffers
|
|
138
|
-
std::vector<webgpu_pool_bufs> staged_param_bufs;
|
|
139
|
-
// Buffers associated with set_rows operations, used to store potential errors
|
|
140
|
-
std::vector<webgpu_pool_bufs> staged_set_row_error_bufs;
|
|
141
|
-
|
|
142
|
-
std::vector<wgpu::FutureWaitInfo> callback_futures;
|
|
143
|
-
|
|
144
304
|
#ifdef GGML_WEBGPU_DEBUG
|
|
145
305
|
wgpu::Buffer debug_host_buf;
|
|
146
306
|
wgpu::Buffer debug_dev_buf;
|
|
147
307
|
#endif
|
|
308
|
+
|
|
309
|
+
#ifdef GGML_WEBGPU_CPU_PROFILE
|
|
310
|
+
// Profiling: labeled CPU time in ms (total)
|
|
311
|
+
std::unordered_map<std::string, double> cpu_time_ms;
|
|
312
|
+
// Profiling: detailed CPU time in ms
|
|
313
|
+
std::unordered_map<std::string, double> cpu_detail_ms;
|
|
314
|
+
#endif
|
|
315
|
+
|
|
316
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
317
|
+
// Profiling: per-shader GPU time in ms
|
|
318
|
+
std::unordered_map<std::string, double> shader_gpu_time_ms;
|
|
319
|
+
// Profiling: pool of timestamp query buffers (one per operation)
|
|
320
|
+
webgpu_gpu_profile_buf_pool timestamp_query_buf_pool;
|
|
321
|
+
#endif
|
|
148
322
|
};
|
|
149
323
|
|
|
150
324
|
typedef std::shared_ptr<webgpu_context_struct> webgpu_context;
|
|
@@ -169,23 +343,66 @@ struct ggml_backend_webgpu_context {
|
|
|
169
343
|
struct ggml_backend_webgpu_buffer_context {
|
|
170
344
|
webgpu_context webgpu_ctx;
|
|
171
345
|
wgpu::Buffer buffer;
|
|
346
|
+
std::string label;
|
|
172
347
|
|
|
173
|
-
ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf) :
|
|
348
|
+
ggml_backend_webgpu_buffer_context(webgpu_context ctx, wgpu::Buffer buf, std::string lbl) :
|
|
174
349
|
webgpu_ctx(std::move(ctx)),
|
|
175
|
-
buffer(std::move(buf))
|
|
350
|
+
buffer(std::move(buf)),
|
|
351
|
+
label(std::move(lbl)) {}
|
|
176
352
|
};
|
|
177
353
|
|
|
178
354
|
/* End struct definitions */
|
|
179
355
|
|
|
180
356
|
/* WebGPU object initializations */
|
|
181
357
|
|
|
358
|
+
// Process a WGSL shader string, replacing tokens of the form {{KEY}} with
|
|
359
|
+
// the corresponding values provided in `repls`.
|
|
360
|
+
static std::string ggml_webgpu_process_shader_repls(const char * src,
|
|
361
|
+
const std::map<std::string, std::string> & repls) {
|
|
362
|
+
if (!src) {
|
|
363
|
+
return std::string();
|
|
364
|
+
}
|
|
365
|
+
std::string s = src;
|
|
366
|
+
for (const auto & kv : repls) {
|
|
367
|
+
std::string token = "{{" + kv.first + "}}";
|
|
368
|
+
size_t pos = 0;
|
|
369
|
+
while ((pos = s.find(token, pos)) != std::string::npos) {
|
|
370
|
+
s.replace(pos, token.length(), kv.second);
|
|
371
|
+
pos += kv.second.length();
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
return s;
|
|
375
|
+
}
|
|
376
|
+
|
|
182
377
|
static void ggml_webgpu_create_pipeline(wgpu::Device & device,
|
|
183
|
-
|
|
378
|
+
webgpu_pipeline & pipeline,
|
|
184
379
|
const char * shader_code,
|
|
185
380
|
const char * label,
|
|
186
381
|
const std::vector<wgpu::ConstantEntry> & constants = {}) {
|
|
187
|
-
|
|
382
|
+
wgpu::ShaderSourceWGSL shader_source;
|
|
383
|
+
shader_source.code = shader_code;
|
|
384
|
+
|
|
385
|
+
wgpu::ShaderModuleDescriptor shader_desc;
|
|
386
|
+
shader_desc.nextInChain = &shader_source;
|
|
387
|
+
|
|
388
|
+
wgpu::ShaderModule shader_module = device.CreateShaderModule(&shader_desc);
|
|
389
|
+
|
|
390
|
+
wgpu::ComputePipelineDescriptor pipeline_desc;
|
|
391
|
+
pipeline_desc.label = label;
|
|
392
|
+
pipeline_desc.compute.module = shader_module;
|
|
393
|
+
pipeline_desc.compute.entryPoint = "main"; // Entry point in the WGSL code
|
|
394
|
+
pipeline_desc.layout = nullptr; // nullptr means auto layout
|
|
395
|
+
if (constants.size() > 0) {
|
|
396
|
+
pipeline_desc.compute.constants = constants.data();
|
|
397
|
+
pipeline_desc.compute.constantCount = constants.size();
|
|
398
|
+
}
|
|
399
|
+
pipeline = { device.CreateComputePipeline(&pipeline_desc), label };
|
|
400
|
+
}
|
|
188
401
|
|
|
402
|
+
static webgpu_pipeline ggml_webgpu_create_pipeline2(wgpu::Device & device,
|
|
403
|
+
const char * shader_code,
|
|
404
|
+
const char * label,
|
|
405
|
+
const std::vector<wgpu::ConstantEntry> & constants = {}) {
|
|
189
406
|
wgpu::ShaderSourceWGSL shader_source;
|
|
190
407
|
shader_source.code = shader_code;
|
|
191
408
|
|
|
@@ -203,7 +420,7 @@ static void ggml_webgpu_create_pipeline(wgpu::Device &
|
|
|
203
420
|
pipeline_desc.compute.constants = constants.data();
|
|
204
421
|
pipeline_desc.compute.constantCount = constants.size();
|
|
205
422
|
}
|
|
206
|
-
|
|
423
|
+
return { device.CreateComputePipeline(&pipeline_desc), label };
|
|
207
424
|
}
|
|
208
425
|
|
|
209
426
|
static void ggml_webgpu_create_buffer(wgpu::Device & device,
|
|
@@ -211,8 +428,6 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
|
|
|
211
428
|
size_t size,
|
|
212
429
|
wgpu::BufferUsage usage,
|
|
213
430
|
const char * label) {
|
|
214
|
-
WEBGPU_LOG_DEBUG("ggml_webgpu_create_buffer()");
|
|
215
|
-
|
|
216
431
|
wgpu::BufferDescriptor buffer_desc;
|
|
217
432
|
buffer_desc.size = size;
|
|
218
433
|
buffer_desc.usage = usage;
|
|
@@ -228,81 +443,35 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device,
|
|
|
228
443
|
/** WebGPU Actions */
|
|
229
444
|
|
|
230
445
|
// Wait for the queue to finish processing all submitted work
|
|
231
|
-
static void
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
UINT64_MAX);
|
|
243
|
-
} else {
|
|
244
|
-
// existing callbacks, wait on them
|
|
245
|
-
ctx->instance.WaitAny(ctx->callback_futures.size(), ctx->callback_futures.data(), UINT64_MAX);
|
|
246
|
-
ctx->callback_futures.clear();
|
|
446
|
+
static void ggml_backend_webgpu_wait(webgpu_context & ctx,
|
|
447
|
+
std::vector<webgpu_submission_futures> & futures,
|
|
448
|
+
bool block = true) {
|
|
449
|
+
// If we have too many in-flight submissions, wait on the oldest one first. If there are many threads,
|
|
450
|
+
// inflight_max may be 0, meaning that we must wait on all futures.
|
|
451
|
+
uint64_t timeout_ms = block ? UINT64_MAX : 0;
|
|
452
|
+
uint inflight_threads = ctx->inflight_threads;
|
|
453
|
+
uint inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u);
|
|
454
|
+
while (futures.size() >= inflight_max && futures.size() > 0) {
|
|
455
|
+
ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX);
|
|
456
|
+
futures.erase(futures.begin());
|
|
247
457
|
}
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
encoder.CopyBufferToBuffer(error_bufs.dev_buf, 0, error_bufs.host_buf, 0, error_bufs.host_buf.GetSize());
|
|
458
|
+
size_t i = 0;
|
|
459
|
+
while (i < futures.size()) {
|
|
460
|
+
auto waitStatus = ctx->instance.WaitAny(futures[i].futures.size(), futures[i].futures.data(), timeout_ms);
|
|
461
|
+
switch (waitStatus) {
|
|
462
|
+
case wgpu::WaitStatus::Success:
|
|
463
|
+
futures.erase(futures.begin() + i);
|
|
464
|
+
break;
|
|
465
|
+
case wgpu::WaitStatus::TimedOut:
|
|
466
|
+
i++;
|
|
467
|
+
break;
|
|
468
|
+
case wgpu::WaitStatus::Error:
|
|
469
|
+
GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an error\n");
|
|
470
|
+
break;
|
|
471
|
+
default:
|
|
472
|
+
GGML_LOG_ERROR("ggml_webgpu: WaitAny returned an unknown status\n");
|
|
473
|
+
break;
|
|
265
474
|
}
|
|
266
|
-
wgpu::CommandBuffer commands = encoder.Finish();
|
|
267
|
-
ctx->queue.Submit(1, &commands);
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
ctx->staged_command_bufs.clear();
|
|
271
|
-
std::vector<webgpu_pool_bufs> staged_param_bufs = std::move(ctx->staged_param_bufs);
|
|
272
|
-
std::vector<webgpu_pool_bufs> staged_set_row_error_bufs = std::move(ctx->staged_set_row_error_bufs);
|
|
273
|
-
|
|
274
|
-
// Free the staged parameter buffers once the submission completes
|
|
275
|
-
wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone(
|
|
276
|
-
wgpu::CallbackMode::AllowSpontaneous,
|
|
277
|
-
[ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
|
278
|
-
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
|
279
|
-
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
|
|
280
|
-
}
|
|
281
|
-
// Free the staged buffers
|
|
282
|
-
ctx->param_buf_pool.free_bufs(staged_param_bufs);
|
|
283
|
-
});
|
|
284
|
-
ctx->callback_futures.push_back({ p_f });
|
|
285
|
-
|
|
286
|
-
// Check for errrors in SET_ROWS operations
|
|
287
|
-
for (auto & error_bufs : staged_set_row_error_bufs) {
|
|
288
|
-
wgpu::Future f = error_bufs.host_buf.MapAsync(
|
|
289
|
-
wgpu::MapMode::Read,
|
|
290
|
-
0,
|
|
291
|
-
error_bufs.host_buf.GetSize(),
|
|
292
|
-
wgpu::CallbackMode::AllowSpontaneous,
|
|
293
|
-
[ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
|
294
|
-
if (status != wgpu::MapAsyncStatus::Success) {
|
|
295
|
-
GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
|
|
296
|
-
} else {
|
|
297
|
-
const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange();
|
|
298
|
-
if (*error_data) {
|
|
299
|
-
GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
|
|
300
|
-
}
|
|
301
|
-
// We can't unmap in here due to WebGPU reentrancy limitations.
|
|
302
|
-
ctx->set_rows_error_buf_pool.free_bufs({ error_bufs });
|
|
303
|
-
}
|
|
304
|
-
});
|
|
305
|
-
ctx->callback_futures.push_back({ f });
|
|
306
475
|
}
|
|
307
476
|
}
|
|
308
477
|
|
|
@@ -311,10 +480,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
|
|
|
311
480
|
wgpu::MapMode mode,
|
|
312
481
|
size_t offset,
|
|
313
482
|
size_t size) {
|
|
314
|
-
ctx->instance.WaitAny(buffer.MapAsync(mode,
|
|
315
|
-
offset,
|
|
316
|
-
size,
|
|
317
|
-
wgpu::CallbackMode::AllowSpontaneous,
|
|
483
|
+
ctx->instance.WaitAny(buffer.MapAsync(mode, offset, size, wgpu::CallbackMode::AllowSpontaneous,
|
|
318
484
|
[](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
|
319
485
|
if (status != wgpu::MapAsyncStatus::Success) {
|
|
320
486
|
GGML_LOG_ERROR("ggml_webgpu: Failed to map buffer: %s\n",
|
|
@@ -329,7 +495,6 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx,
|
|
|
329
495
|
// To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and
|
|
330
496
|
// debug statements in the shader, and then call this function after encoding the commands and submitting them.
|
|
331
497
|
static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
|
|
332
|
-
ggml_backend_webgpu_submit_queue(ctx);
|
|
333
498
|
wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
|
|
334
499
|
encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize());
|
|
335
500
|
wgpu::CommandBuffer commands = encoder.Finish();
|
|
@@ -346,12 +511,86 @@ static void ggml_backend_webgpu_debug(webgpu_context & ctx) {
|
|
|
346
511
|
}
|
|
347
512
|
#endif
|
|
348
513
|
|
|
349
|
-
static
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
514
|
+
static webgpu_submission_futures ggml_backend_webgpu_submit(webgpu_context ctx, std::vector<webgpu_command> commands) {
|
|
515
|
+
std::vector<wgpu::CommandBuffer> command_buffers;
|
|
516
|
+
std::vector<webgpu_pool_bufs> params_bufs;
|
|
517
|
+
std::vector<webgpu_pool_bufs> set_rows_error_bufs;
|
|
518
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
519
|
+
std::vector<std::pair<std::string, webgpu_gpu_profile_bufs>> pipeline_name_and_ts_bufs;
|
|
520
|
+
#endif
|
|
521
|
+
|
|
522
|
+
for (const auto & command : commands) {
|
|
523
|
+
command_buffers.push_back(command.commands);
|
|
524
|
+
params_bufs.push_back(command.params_bufs);
|
|
525
|
+
if (command.set_rows_error_bufs) {
|
|
526
|
+
set_rows_error_bufs.push_back(command.set_rows_error_bufs.value());
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
ctx->queue.Submit(command_buffers.size(), command_buffers.data());
|
|
530
|
+
|
|
531
|
+
std::vector<wgpu::FutureWaitInfo> futures;
|
|
532
|
+
|
|
533
|
+
wgpu::Future p_f = ctx->queue.OnSubmittedWorkDone(
|
|
534
|
+
wgpu::CallbackMode::AllowSpontaneous,
|
|
535
|
+
[ctx, params_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
|
536
|
+
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
|
537
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str());
|
|
538
|
+
}
|
|
539
|
+
// Free the staged buffers
|
|
540
|
+
ctx->param_buf_pool.free_bufs({ params_bufs });
|
|
541
|
+
});
|
|
542
|
+
futures.push_back({ p_f });
|
|
543
|
+
|
|
544
|
+
for (const auto & bufs : set_rows_error_bufs) {
|
|
545
|
+
wgpu::Future f = bufs.host_buf.MapAsync(
|
|
546
|
+
wgpu::MapMode::Read, 0, bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
|
|
547
|
+
[ctx, bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
|
548
|
+
if (status != wgpu::MapAsyncStatus::Success) {
|
|
549
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str());
|
|
550
|
+
} else {
|
|
551
|
+
const uint32_t * error_data = (const uint32_t *) bufs.host_buf.GetConstMappedRange();
|
|
552
|
+
if (*error_data) {
|
|
553
|
+
GGML_ABORT("ggml_webgpu: SET_ROWS index > 2^32, unsupported.");
|
|
554
|
+
}
|
|
555
|
+
// We can't unmap in here due to WebGPU reentrancy limitations.
|
|
556
|
+
ctx->set_rows_error_buf_pool.free_bufs({ bufs });
|
|
557
|
+
}
|
|
558
|
+
});
|
|
559
|
+
futures.push_back({ f });
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
563
|
+
for (const auto & command : commands) {
|
|
564
|
+
auto label = command.pipeline_name;
|
|
565
|
+
auto ts_bufs = command.timestamp_query_bufs;
|
|
566
|
+
|
|
567
|
+
wgpu::Future f = ts_bufs.host_buf.MapAsync(
|
|
568
|
+
wgpu::MapMode::Read, 0, ts_bufs.host_buf.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
|
|
569
|
+
[ctx, ts_bufs, label](wgpu::MapAsyncStatus status, wgpu::StringView message) {
|
|
570
|
+
if (status != wgpu::MapAsyncStatus::Success) {
|
|
571
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to map timestamp buffer: %s\n", std::string(message).c_str());
|
|
572
|
+
} else {
|
|
573
|
+
const uint64_t * ts_data = (const uint64_t *) ts_bufs.host_buf.GetConstMappedRange();
|
|
574
|
+
// WebGPU timestamps are in ns; convert to ms
|
|
575
|
+
double elapsed_ms = double(ts_data[1] - ts_data[0]) * 1e-6;
|
|
576
|
+
ctx->shader_gpu_time_ms[label] += elapsed_ms;
|
|
577
|
+
// We can't unmap in here due to WebGPU reentrancy limitations.
|
|
578
|
+
ctx->timestamp_query_buf_pool.free_bufs({ ts_bufs });
|
|
579
|
+
}
|
|
580
|
+
});
|
|
581
|
+
futures.push_back({ f });
|
|
582
|
+
}
|
|
583
|
+
#endif
|
|
584
|
+
return { futures };
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
static webgpu_command ggml_backend_webgpu_build(webgpu_context & ctx,
|
|
588
|
+
webgpu_pipeline & pipeline,
|
|
589
|
+
std::vector<uint32_t> params,
|
|
590
|
+
std::vector<wgpu::BindGroupEntry> bind_group_entries,
|
|
591
|
+
uint32_t wg_x,
|
|
592
|
+
uint32_t wg_y = 1,
|
|
593
|
+
std::optional<webgpu_pool_bufs> set_rows_error_bufs = std::nullopt) {
|
|
355
594
|
webgpu_pool_bufs params_bufs = ctx->param_buf_pool.alloc_bufs();
|
|
356
595
|
|
|
357
596
|
ggml_backend_webgpu_map_buffer(ctx, params_bufs.host_buf, wgpu::MapMode::Write, 0, params_bufs.host_buf.GetSize());
|
|
@@ -369,41 +608,58 @@ static void ggml_backend_webgpu_build_and_enqueue(webgpu_context &
|
|
|
369
608
|
.size = params_bufs.dev_buf.GetSize() });
|
|
370
609
|
|
|
371
610
|
wgpu::BindGroupDescriptor bind_group_desc;
|
|
372
|
-
bind_group_desc.layout = pipeline.GetBindGroupLayout(0);
|
|
611
|
+
bind_group_desc.layout = pipeline.pipeline.GetBindGroupLayout(0);
|
|
373
612
|
bind_group_desc.entryCount = bind_group_entries.size();
|
|
374
613
|
bind_group_desc.entries = bind_group_entries.data();
|
|
614
|
+
bind_group_desc.label = pipeline.name.c_str();
|
|
375
615
|
wgpu::BindGroup bind_group = ctx->device.CreateBindGroup(&bind_group_desc);
|
|
376
616
|
|
|
377
617
|
wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder();
|
|
378
618
|
encoder.CopyBufferToBuffer(params_bufs.host_buf, 0, params_bufs.dev_buf, 0, params_bufs.dev_buf.GetSize());
|
|
619
|
+
|
|
620
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
621
|
+
// --- Profiling: GPU timestamp queries ---
|
|
622
|
+
// Allocate a timestamp query buffer (2 timestamps: start/end)
|
|
623
|
+
webgpu_gpu_profile_bufs ts_bufs = ctx->timestamp_query_buf_pool.alloc_bufs();
|
|
624
|
+
if (ts_bufs.host_buf.GetMapState() == wgpu::BufferMapState::Mapped) {
|
|
625
|
+
ts_bufs.host_buf.Unmap();
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
wgpu::PassTimestampWrites ts_writes = { .querySet = ts_bufs.query_set,
|
|
629
|
+
.beginningOfPassWriteIndex = 0,
|
|
630
|
+
.endOfPassWriteIndex = 1 };
|
|
631
|
+
wgpu::ComputePassDescriptor pass_desc = { .timestampWrites = &ts_writes };
|
|
632
|
+
wgpu::ComputePassEncoder pass = encoder.BeginComputePass(&pass_desc);
|
|
633
|
+
#else
|
|
379
634
|
wgpu::ComputePassEncoder pass = encoder.BeginComputePass();
|
|
380
|
-
|
|
635
|
+
#endif
|
|
636
|
+
pass.SetPipeline(pipeline.pipeline);
|
|
381
637
|
pass.SetBindGroup(0, bind_group);
|
|
382
|
-
pass.DispatchWorkgroups(wg_x,
|
|
638
|
+
pass.DispatchWorkgroups(wg_x, wg_y, 1);
|
|
383
639
|
pass.End();
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
}),
|
|
396
|
-
UINT64_MAX);
|
|
397
|
-
} else {
|
|
398
|
-
// Lock the context mutex when pushing to the staging vectors.
|
|
399
|
-
std::lock_guard<std::recursive_mutex> lock(ctx->mutex);
|
|
400
|
-
// Enqueue commands and only submit if we have enough staged commands
|
|
401
|
-
ctx->staged_command_bufs.push_back(commands);
|
|
402
|
-
ctx->staged_param_bufs.push_back(params_bufs);
|
|
403
|
-
if (ctx->staged_command_bufs.size() == WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) {
|
|
404
|
-
ggml_backend_webgpu_submit_queue(ctx);
|
|
405
|
-
}
|
|
640
|
+
|
|
641
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
642
|
+
// Resolve the query set into the device buffer
|
|
643
|
+
encoder.ResolveQuerySet(ts_bufs.query_set, 0, 2, ts_bufs.dev_buf, 0);
|
|
644
|
+
encoder.CopyBufferToBuffer(ts_bufs.dev_buf, 0, ts_bufs.host_buf, 0, ts_bufs.host_buf.GetSize());
|
|
645
|
+
#endif
|
|
646
|
+
|
|
647
|
+
// If there are SET_ROWS operations in this submission, copy their error buffers to the host.
|
|
648
|
+
if (set_rows_error_bufs) {
|
|
649
|
+
encoder.CopyBufferToBuffer(set_rows_error_bufs->dev_buf, 0, set_rows_error_bufs->host_buf, 0,
|
|
650
|
+
set_rows_error_bufs->host_buf.GetSize());
|
|
406
651
|
}
|
|
652
|
+
|
|
653
|
+
wgpu::CommandBuffer commands = encoder.Finish();
|
|
654
|
+
webgpu_command result = {};
|
|
655
|
+
result.commands = commands;
|
|
656
|
+
result.params_bufs = params_bufs;
|
|
657
|
+
result.set_rows_error_bufs = set_rows_error_bufs;
|
|
658
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
659
|
+
result.timestamp_query_bufs = ts_bufs;
|
|
660
|
+
result.pipeline_name = pipeline.name;
|
|
661
|
+
#endif
|
|
662
|
+
return result;
|
|
407
663
|
}
|
|
408
664
|
|
|
409
665
|
static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
|
|
@@ -415,9 +671,12 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx,
|
|
|
415
671
|
std::vector<wgpu::BindGroupEntry> entries = {
|
|
416
672
|
{ .binding = 0, .buffer = buf, .offset = 0, .size = buf.GetSize() }
|
|
417
673
|
};
|
|
418
|
-
size_t bytes_per_wg = ctx->
|
|
674
|
+
size_t bytes_per_wg = ctx->max_wg_size_x * ctx->memset_bytes_per_thread;
|
|
419
675
|
uint32_t wg_x = ((size + 3) + bytes_per_wg - 1) / bytes_per_wg;
|
|
420
|
-
|
|
676
|
+
|
|
677
|
+
webgpu_command command = ggml_backend_webgpu_build(ctx, ctx->memset_pipeline, params, entries, wg_x);
|
|
678
|
+
std::vector<webgpu_submission_futures> futures = { ggml_backend_webgpu_submit(ctx, { command }) };
|
|
679
|
+
ggml_backend_webgpu_wait(ctx, futures);
|
|
421
680
|
}
|
|
422
681
|
|
|
423
682
|
/** End WebGPU Actions */
|
|
@@ -433,8 +692,48 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) {
|
|
|
433
692
|
ggml_backend_webgpu_context * ctx = (ggml_backend_webgpu_context *) backend->context;
|
|
434
693
|
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_free(" << ctx->name << ")");
|
|
435
694
|
|
|
436
|
-
|
|
695
|
+
#ifdef GGML_WEBGPU_CPU_PROFILE
|
|
696
|
+
std::cout << "\n[ggml_webgpu cpu profiling summary]\n";
|
|
697
|
+
double total_cpu = 0.0;
|
|
698
|
+
for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) {
|
|
699
|
+
total_cpu += kv.second;
|
|
700
|
+
}
|
|
701
|
+
std::cout << "ggml_webgpu: total cpu time: " << total_cpu << " ms\n";
|
|
702
|
+
std::cout << "ggml_webgpu: cpu breakdown:\n";
|
|
703
|
+
for (const auto & kv : ctx->webgpu_ctx->cpu_time_ms) {
|
|
704
|
+
double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0;
|
|
705
|
+
std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
|
|
706
|
+
}
|
|
707
|
+
if (ctx->webgpu_ctx->cpu_detail_ms.size() > 0) {
|
|
708
|
+
std::cout << "ggml_webgpu: cpu detailed breakdown:\n";
|
|
709
|
+
}
|
|
710
|
+
for (const auto & kv : ctx->webgpu_ctx->cpu_detail_ms) {
|
|
711
|
+
double pct = (total_cpu > 0.0) ? (kv.second / total_cpu * 100.0) : 0.0;
|
|
712
|
+
std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
|
|
713
|
+
}
|
|
714
|
+
#endif
|
|
715
|
+
|
|
716
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
717
|
+
std::cout << "\n[ggml_webgpu gpu profiling summary]\n";
|
|
718
|
+
double total_gpu = 0.0;
|
|
719
|
+
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
|
|
720
|
+
total_gpu += kv.second;
|
|
721
|
+
}
|
|
722
|
+
std::cout << "ggml_webgpu: total gpu time (all shaders): " << total_gpu << " ms\n";
|
|
723
|
+
std::cout << "\nggml_webgpu: gpu breakdown:\n";
|
|
724
|
+
for (const auto & kv : ctx->webgpu_ctx->shader_gpu_time_ms) {
|
|
725
|
+
double pct = (total_gpu > 0.0) ? (kv.second / total_gpu * 100.0) : 0.0;
|
|
726
|
+
std::cout << "ggml_webgpu: " << kv.first << ": " << kv.second << " ms (" << pct << "%)\n";
|
|
727
|
+
}
|
|
728
|
+
#endif
|
|
729
|
+
|
|
730
|
+
#if defined(GGML_WEBGPU_CPU_PROFILE) && defined(GGML_WEBGPU_GPU_PROFILE)
|
|
731
|
+
std::cout << "ggml_webgpu: gpu/cpu ratio: " << (total_cpu > 0.0 ? total_gpu / total_cpu : 0.0) << "\n";
|
|
732
|
+
#endif
|
|
733
|
+
|
|
734
|
+
#if !defined(GGML_WEBGPU_CPU_PROFILE) && !defined(GGML_WEBGPU_GPU_PROFILE)
|
|
437
735
|
GGML_UNUSED(ctx);
|
|
736
|
+
#endif
|
|
438
737
|
}
|
|
439
738
|
|
|
440
739
|
static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) {
|
|
@@ -461,26 +760,27 @@ static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor
|
|
|
461
760
|
~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
|
|
462
761
|
}
|
|
463
762
|
|
|
464
|
-
|
|
763
|
+
// Used to determine if two tensors are the same for in-place operations
|
|
764
|
+
static bool ggml_webgpu_tensor_equal(ggml_tensor * a, ggml_tensor * b) {
|
|
765
|
+
return (ggml_webgpu_tensor_buf(a).Get() == ggml_webgpu_tensor_buf(b).Get()) &&
|
|
766
|
+
(ggml_webgpu_tensor_offset(a) == ggml_webgpu_tensor_offset(b));
|
|
767
|
+
}
|
|
768
|
+
|
|
769
|
+
static webgpu_command ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
|
465
770
|
uint32_t ne = (uint32_t) ggml_nelements(dst);
|
|
466
771
|
|
|
467
|
-
std::vector<uint32_t> params = {
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
// Logical shape — same for both tensors even if permuted
|
|
480
|
-
(uint32_t) src->ne[0],
|
|
481
|
-
(uint32_t) src->ne[1],
|
|
482
|
-
(uint32_t) src->ne[2],
|
|
483
|
-
(uint32_t) src->ne[3] };
|
|
772
|
+
std::vector<uint32_t> params = {
|
|
773
|
+
ne, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
|
774
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
775
|
+
// Convert byte-strides to element-strides
|
|
776
|
+
(uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)),
|
|
777
|
+
(uint32_t) (src->nb[2] / ggml_type_size(src->type)), (uint32_t) (src->nb[3] / ggml_type_size(src->type)),
|
|
778
|
+
(uint32_t) (dst->nb[0] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
|
779
|
+
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
|
780
|
+
// Logical shapes
|
|
781
|
+
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) dst->ne[0],
|
|
782
|
+
(uint32_t) dst->ne[1], (uint32_t) dst->ne[2]
|
|
783
|
+
};
|
|
484
784
|
|
|
485
785
|
std::vector<wgpu::BindGroupEntry> entries = {
|
|
486
786
|
{ .binding = 0,
|
|
@@ -493,15 +793,18 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor
|
|
|
493
793
|
.size = ggml_webgpu_tensor_binding_size(ctx, dst) }
|
|
494
794
|
};
|
|
495
795
|
|
|
496
|
-
size_t max_wg_size = ctx->
|
|
796
|
+
size_t max_wg_size = ctx->max_wg_size_x;
|
|
497
797
|
uint32_t wg_x = (ne + max_wg_size - 1) / max_wg_size;
|
|
498
|
-
|
|
798
|
+
return ggml_backend_webgpu_build(ctx, ctx->cpy_pipeline[src->type][dst->type], params, entries, wg_x);
|
|
499
799
|
}
|
|
500
800
|
|
|
501
|
-
static
|
|
801
|
+
static std::optional<webgpu_command> ggml_webgpu_set_rows(webgpu_context & ctx,
|
|
802
|
+
ggml_tensor * src,
|
|
803
|
+
ggml_tensor * idx,
|
|
804
|
+
ggml_tensor * dst) {
|
|
502
805
|
// For set rows specifically, we need to check if src and idx are empty tensors.
|
|
503
806
|
if (ggml_is_empty(src) || ggml_is_empty(idx)) {
|
|
504
|
-
return;
|
|
807
|
+
return std::nullopt;
|
|
505
808
|
}
|
|
506
809
|
|
|
507
810
|
webgpu_pool_bufs error_bufs = ctx->set_rows_error_buf_pool.alloc_bufs();
|
|
@@ -509,27 +812,21 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
|
|
|
509
812
|
error_bufs.host_buf.Unmap();
|
|
510
813
|
}
|
|
511
814
|
|
|
512
|
-
std::vector<uint32_t> params = {
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
(uint32_t) src->ne[1],
|
|
528
|
-
(uint32_t) src->ne[2],
|
|
529
|
-
(uint32_t) src->ne[3],
|
|
530
|
-
// Shape of idx
|
|
531
|
-
(uint32_t) (idx->ne[1]),
|
|
532
|
-
(uint32_t) (idx->ne[2]) };
|
|
815
|
+
std::vector<uint32_t> params = {
|
|
816
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
|
817
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
|
|
818
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
819
|
+
// Convert byte-strides to element-strides
|
|
820
|
+
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
|
821
|
+
(uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
|
|
822
|
+
(uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
|
|
823
|
+
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
|
824
|
+
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
|
825
|
+
// Shape of src
|
|
826
|
+
(uint32_t) src->ne[0], (uint32_t) src->ne[1], (uint32_t) src->ne[2], (uint32_t) src->ne[3],
|
|
827
|
+
// Shape of idx
|
|
828
|
+
(uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
|
|
829
|
+
};
|
|
533
830
|
|
|
534
831
|
std::vector<wgpu::BindGroupEntry> entries = {
|
|
535
832
|
{ .binding = 0,
|
|
@@ -547,22 +844,77 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t
|
|
|
547
844
|
{ .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() }
|
|
548
845
|
};
|
|
549
846
|
|
|
550
|
-
size_t
|
|
551
|
-
uint32_t wg_x = (src->ne[1] * src->ne[2] * src->ne[3] + max_wg_size - 1) / max_wg_size;
|
|
847
|
+
size_t max_wg_size = ctx->max_wg_size_x;
|
|
552
848
|
|
|
553
|
-
|
|
554
|
-
ctx->
|
|
849
|
+
int vectorized = src->ne[0] % 4 == 0;
|
|
850
|
+
webgpu_pipeline pipeline = ctx->set_rows_pipeline[0][vectorized];
|
|
851
|
+
uint32_t threads;
|
|
852
|
+
if (vectorized) {
|
|
853
|
+
threads = (src->ne[1] * src->ne[2] * src->ne[3]) * (src->ne[0] / 4);
|
|
854
|
+
} else {
|
|
855
|
+
threads = src->ne[0] * src->ne[1] * src->ne[2] * src->ne[3];
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
uint32_t wg_x = (threads + max_wg_size - 1) / max_wg_size;
|
|
555
859
|
|
|
556
|
-
|
|
860
|
+
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, 1, error_bufs);
|
|
557
861
|
}
|
|
558
862
|
|
|
559
|
-
static
|
|
863
|
+
static webgpu_command ggml_webgpu_get_rows(webgpu_context & ctx,
|
|
864
|
+
ggml_tensor * src,
|
|
865
|
+
ggml_tensor * idx,
|
|
866
|
+
ggml_tensor * dst) {
|
|
867
|
+
std::vector<uint32_t> params = {
|
|
868
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
|
869
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)),
|
|
870
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
871
|
+
// Convert byte-strides to element-strides
|
|
872
|
+
(uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
|
873
|
+
(uint32_t) (src->nb[3] / ggml_type_size(src->type)), (uint32_t) (idx->nb[0] / ggml_type_size(idx->type)),
|
|
874
|
+
(uint32_t) (idx->nb[1] / ggml_type_size(idx->type)), (uint32_t) (idx->nb[2] / ggml_type_size(idx->type)),
|
|
875
|
+
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)), (uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
|
876
|
+
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
|
877
|
+
// Shape of dst
|
|
878
|
+
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3],
|
|
879
|
+
// Shape of idx
|
|
880
|
+
(uint32_t) (idx->ne[1]), (uint32_t) (idx->ne[2])
|
|
881
|
+
};
|
|
882
|
+
|
|
883
|
+
std::vector<wgpu::BindGroupEntry> entries = {
|
|
884
|
+
{ .binding = 0,
|
|
885
|
+
.buffer = ggml_webgpu_tensor_buf(src),
|
|
886
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
|
|
887
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, src) },
|
|
888
|
+
{ .binding = 1,
|
|
889
|
+
.buffer = ggml_webgpu_tensor_buf(idx),
|
|
890
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, idx),
|
|
891
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, idx) },
|
|
892
|
+
{ .binding = 2,
|
|
893
|
+
.buffer = ggml_webgpu_tensor_buf(dst),
|
|
894
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
|
895
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, dst) }
|
|
896
|
+
};
|
|
897
|
+
|
|
898
|
+
size_t max_wg_size = ctx->max_wg_size_x;
|
|
899
|
+
uint32_t wg_x = (dst->ne[1] * dst->ne[2] * dst->ne[3] + max_wg_size - 1) / max_wg_size;
|
|
900
|
+
|
|
901
|
+
webgpu_pipeline pipeline = ctx->get_rows_pipeline[src->type];
|
|
902
|
+
if (src->type == GGML_TYPE_F32 && dst->ne[0] % 4 != 0) {
|
|
903
|
+
pipeline = ctx->get_rows_f32_no_vec_pipeline;
|
|
904
|
+
}
|
|
905
|
+
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
|
|
909
|
+
ggml_tensor * src0,
|
|
910
|
+
ggml_tensor * src1,
|
|
911
|
+
ggml_tensor * dst) {
|
|
560
912
|
std::vector<uint32_t> params = {
|
|
561
913
|
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
|
|
562
914
|
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
|
563
915
|
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
564
|
-
(uint32_t) dst->ne[
|
|
565
|
-
(uint32_t) dst->ne[
|
|
916
|
+
(uint32_t) dst->ne[0], // number of rows in result (M, transposed)
|
|
917
|
+
(uint32_t) dst->ne[1], // number of columns in result (N)
|
|
566
918
|
(uint32_t) src0->ne[0], // number of columns in src0/src1 (K)
|
|
567
919
|
(uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1
|
|
568
920
|
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1
|
|
@@ -591,46 +943,463 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t
|
|
|
591
943
|
.size = ggml_webgpu_tensor_binding_size(ctx, dst) },
|
|
592
944
|
};
|
|
593
945
|
|
|
946
|
+
webgpu_pipeline pipeline = ctx->mul_mat_pipeline[src0->type][src1->type];
|
|
947
|
+
|
|
594
948
|
uint32_t wg_x =
|
|
595
949
|
(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE;
|
|
596
|
-
|
|
950
|
+
uint32_t wg_y = 1;
|
|
951
|
+
|
|
952
|
+
bool use_fast = false;
|
|
953
|
+
switch (src1->type) {
|
|
954
|
+
case GGML_TYPE_F16:
|
|
955
|
+
use_fast = (src0->type == GGML_TYPE_F16);
|
|
956
|
+
break;
|
|
957
|
+
case GGML_TYPE_F32:
|
|
958
|
+
switch (src0->type) {
|
|
959
|
+
case GGML_TYPE_F32:
|
|
960
|
+
case GGML_TYPE_F16:
|
|
961
|
+
case GGML_TYPE_Q4_0:
|
|
962
|
+
use_fast = true;
|
|
963
|
+
break;
|
|
964
|
+
default:
|
|
965
|
+
break;
|
|
966
|
+
}
|
|
967
|
+
break;
|
|
968
|
+
default:
|
|
969
|
+
break;
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
if (use_fast) {
|
|
973
|
+
int vectorized = src0->ne[0] % 4 == 0 && dst->ne[0] % 4 == 0 && dst->ne[1] % 4 == 0;
|
|
974
|
+
if (dst->ne[1] == 1) {
|
|
975
|
+
// We don't support vectorized mul_mat_vec for quantized types
|
|
976
|
+
vectorized = vectorized && (src0->type < 2);
|
|
977
|
+
pipeline = ctx->mul_mat_vec_pipelines[src0->type][src1->type][vectorized];
|
|
978
|
+
uint32_t batches = dst->ne[2] * dst->ne[3];
|
|
979
|
+
uint32_t output_groups =
|
|
980
|
+
(dst->ne[0] + WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG - 1) / WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
|
|
981
|
+
uint32_t total_wg = output_groups * batches;
|
|
982
|
+
wg_x = total_wg % ctx->limits.maxComputeWorkgroupsPerDimension;
|
|
983
|
+
wg_y = (total_wg + ctx->limits.maxComputeWorkgroupsPerDimension - 1) /
|
|
984
|
+
ctx->limits.maxComputeWorkgroupsPerDimension;
|
|
985
|
+
} else {
|
|
986
|
+
pipeline = ctx->mul_mat_pipelines[src0->type][src1->type][vectorized];
|
|
987
|
+
uint32_t wg_m;
|
|
988
|
+
uint32_t wg_n;
|
|
989
|
+
if (ctx->supports_subgroup_matrix) {
|
|
990
|
+
// The total number of subgroups/workgroups needed per matrix.
|
|
991
|
+
uint32_t wg_m_sg_tile =
|
|
992
|
+
WEBGPU_MUL_MAT_SUBGROUP_M * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M * ctx->subgroup_matrix_config.M;
|
|
993
|
+
wg_m = (dst->ne[0] + wg_m_sg_tile - 1) / wg_m_sg_tile;
|
|
994
|
+
uint32_t wg_n_sg_tile =
|
|
995
|
+
WEBGPU_MUL_MAT_SUBGROUP_N * WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N * ctx->subgroup_matrix_config.N;
|
|
996
|
+
wg_n = (dst->ne[1] + wg_n_sg_tile - 1) / wg_n_sg_tile;
|
|
997
|
+
} else {
|
|
998
|
+
uint32_t tile_m_s = WEBGPU_MUL_MAT_TILE_M * WEBGPU_MUL_MAT_WG_SIZE_M;
|
|
999
|
+
uint32_t tile_n_s = WEBGPU_MUL_MAT_TILE_N * WEBGPU_MUL_MAT_WG_SIZE_N;
|
|
1000
|
+
wg_m = (dst->ne[0] + tile_m_s - 1) / tile_m_s;
|
|
1001
|
+
wg_n = (dst->ne[1] + tile_n_s - 1) / tile_n_s;
|
|
1002
|
+
}
|
|
1003
|
+
wg_x = wg_m * wg_n * dst->ne[2] * dst->ne[3];
|
|
1004
|
+
}
|
|
1005
|
+
}
|
|
1006
|
+
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x, wg_y);
|
|
1007
|
+
}
|
|
1008
|
+
|
|
1009
|
+
static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx,
|
|
1010
|
+
ggml_tensor * src0,
|
|
1011
|
+
ggml_tensor * src1,
|
|
1012
|
+
ggml_tensor * dst,
|
|
1013
|
+
webgpu_pipeline & pipeline,
|
|
1014
|
+
bool inplace) {
|
|
1015
|
+
std::vector<uint32_t> params = {
|
|
1016
|
+
(uint32_t) ggml_nelements(dst),
|
|
1017
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)),
|
|
1018
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)),
|
|
1019
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
1020
|
+
(uint32_t) (src1->nb[0] / ggml_type_size(src1->type)),
|
|
1021
|
+
(uint32_t) (src1->nb[1] / ggml_type_size(src1->type)),
|
|
1022
|
+
(uint32_t) (src1->nb[2] / ggml_type_size(src1->type)),
|
|
1023
|
+
(uint32_t) (src1->nb[3] / ggml_type_size(src1->type)),
|
|
1024
|
+
(uint32_t) src0->ne[0],
|
|
1025
|
+
(uint32_t) src0->ne[1],
|
|
1026
|
+
(uint32_t) src0->ne[2],
|
|
1027
|
+
(uint32_t) src1->ne[0],
|
|
1028
|
+
(uint32_t) src1->ne[1],
|
|
1029
|
+
(uint32_t) src1->ne[2],
|
|
1030
|
+
(uint32_t) src1->ne[3],
|
|
1031
|
+
};
|
|
1032
|
+
|
|
1033
|
+
std::vector<wgpu::BindGroupEntry> entries = {
|
|
1034
|
+
{ .binding = 0,
|
|
1035
|
+
.buffer = ggml_webgpu_tensor_buf(src0),
|
|
1036
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, src0),
|
|
1037
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, src0) },
|
|
1038
|
+
{ .binding = 1,
|
|
1039
|
+
.buffer = ggml_webgpu_tensor_buf(src1),
|
|
1040
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, src1),
|
|
1041
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, src1) }
|
|
1042
|
+
};
|
|
1043
|
+
if (!inplace) {
|
|
1044
|
+
entries.push_back({ .binding = 2,
|
|
1045
|
+
.buffer = ggml_webgpu_tensor_buf(dst),
|
|
1046
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
|
1047
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
size_t max_wg_size = ctx->max_wg_size_x;
|
|
1051
|
+
uint32_t wg_x = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
|
|
1052
|
+
return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
|
|
1053
|
+
}
|
|
1054
|
+
|
|
1055
|
+
static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
|
1056
|
+
int inplace = ggml_webgpu_tensor_equal(src, dst);
|
|
1057
|
+
|
|
1058
|
+
std::vector<uint32_t> params = {
|
|
1059
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
|
1060
|
+
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)),
|
|
1061
|
+
(uint32_t) (src->nb[1] / ggml_type_size(src->type)),
|
|
1062
|
+
(uint32_t) (src->nb[2] / ggml_type_size(src->type)),
|
|
1063
|
+
(uint32_t) (src->nb[3] / ggml_type_size(src->type)),
|
|
1064
|
+
(uint32_t) (dst->nb[1] / ggml_type_size(dst->type)),
|
|
1065
|
+
(uint32_t) (dst->nb[2] / ggml_type_size(dst->type)),
|
|
1066
|
+
(uint32_t) (dst->nb[3] / ggml_type_size(dst->type)),
|
|
1067
|
+
(uint32_t) src->ne[0],
|
|
1068
|
+
(uint32_t) src->ne[1],
|
|
1069
|
+
(uint32_t) src->ne[2],
|
|
1070
|
+
(uint32_t) src->ne[3],
|
|
1071
|
+
*(uint32_t *) dst->op_params // epsilon, treated as f32 in the shader
|
|
1072
|
+
};
|
|
1073
|
+
|
|
1074
|
+
std::vector<wgpu::BindGroupEntry> entries = {
|
|
1075
|
+
{ .binding = 0,
|
|
1076
|
+
.buffer = ggml_webgpu_tensor_buf(src),
|
|
1077
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, src),
|
|
1078
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, src) }
|
|
1079
|
+
};
|
|
1080
|
+
if (!inplace) {
|
|
1081
|
+
entries.push_back({ .binding = 1,
|
|
1082
|
+
.buffer = ggml_webgpu_tensor_buf(dst),
|
|
1083
|
+
.offset = ggml_webgpu_tensor_align_offset(ctx, dst),
|
|
1084
|
+
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
return ggml_backend_webgpu_build(ctx, ctx->rms_norm_pipeline[inplace], params, entries, ggml_nrows(src));
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
// Encodes a GGML_OP_ROPE dispatch.
//
//   ctx  - WebGPU context providing the pipelines and device limits
//   src0 - input tensor (binding 0); its element strides and ne[0..2] are passed to the shader
//   src1 - second input (binding 1) -- presumably the position tensor; TODO confirm against the shader
//   src2 - optional frequency-factor tensor (binding 2 when present), may be nullptr
//   dst  - output tensor; also carries the op_params that configure the operation
//
// When src0 and dst refer to the same tensor the in-place pipeline variant is selected and no
// separate dst binding is added. Returns the command built by ggml_backend_webgpu_build.
static webgpu_command ggml_webgpu_rope(webgpu_context & ctx,
                                       ggml_tensor *    src0,
                                       ggml_tensor *    src1,
                                       ggml_tensor *    src2,
                                       ggml_tensor *    dst) {
    const int inplace         = ggml_webgpu_tensor_equal(src0, dst);
    const int has_freq_factor = (src2 != nullptr);

    // integer parameters live at fixed slots of op_params
    const int n_dims     = ((int32_t *) dst->op_params)[1];
    const int mode       = ((int32_t *) dst->op_params)[2];
    const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

    // float parameters are stored bit-for-bit inside the int32 op_params array (slots 5..10)
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
    memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
    memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
    memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

    int sections[4];
    memcpy(sections, (int32_t *) dst->op_params + 11, 4 * sizeof(int));

    float theta_scale = powf(freq_base, -2.0f / n_dims);

    float corr_dims[2];
    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

    // Copies the bit pattern of a float into a u32. memcpy is used instead of
    // `*(uint32_t *) &f`, which reads a float object through an incompatible lvalue type.
    const auto f32_bits = [](float value) {
        uint32_t bits;
        memcpy(&bits, &value, sizeof(bits));
        return bits;
    };
    // Byte misalignment of a tensor's binding, expressed in elements of the tensor's type.
    const auto misalign_elems = [&](ggml_tensor * t) {
        return (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, t) / ggml_type_size(t->type));
    };
    // Byte stride nb[dim] expressed in elements of the tensor's type.
    const auto stride_elems = [](const ggml_tensor * t, int dim) {
        return (uint32_t) (t->nb[dim] / ggml_type_size(t->type));
    };

    // NOTE: the order of these values must match the parameter layout expected by the shader.
    std::vector<uint32_t> params = {
        misalign_elems(src0),
        misalign_elems(src1),
        src2 != nullptr ? misalign_elems(src2) : 0,
        misalign_elems(dst),
        stride_elems(src0, 1),
        stride_elems(src0, 2),
        stride_elems(src0, 3),
        stride_elems(dst, 1),
        stride_elems(dst, 2),
        stride_elems(dst, 3),
        (uint32_t) ggml_nelements(src0) / 2,
        (uint32_t) src0->ne[0],
        (uint32_t) src0->ne[1],
        (uint32_t) src0->ne[2],
        (uint32_t) n_dims,
        (uint32_t) mode,
        f32_bits(theta_scale),
        f32_bits(attn_factor),
        f32_bits(freq_scale),
        f32_bits(ext_factor),
        f32_bits(corr_dims[0]),
        f32_bits(corr_dims[1]),
        (uint32_t) sections[0],
        (uint32_t) sections[1],
        (uint32_t) sections[2],
        (uint32_t) sections[3]
    };

    std::vector<wgpu::BindGroupEntry> entries = {
        { .binding = 0,
         .buffer  = ggml_webgpu_tensor_buf(src0),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, src0),
         .size    = ggml_webgpu_tensor_binding_size(ctx, src0) },
        { .binding = 1,
         .buffer  = ggml_webgpu_tensor_buf(src1),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
         .size    = ggml_webgpu_tensor_binding_size(ctx, src1) }
    };
    // dst takes the first binding slot after the inputs that are actually bound
    uint32_t dst_binding = 2;
    if (has_freq_factor) {
        dst_binding = 3;
        entries.push_back({ .binding = 2,
                            .buffer  = ggml_webgpu_tensor_buf(src2),
                            .offset  = ggml_webgpu_tensor_align_offset(ctx, src2),
                            .size    = ggml_webgpu_tensor_binding_size(ctx, src2) });
    }
    if (!inplace) {
        entries.push_back({ .binding = dst_binding,
                            .buffer  = ggml_webgpu_tensor_buf(dst),
                            .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
                            .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
    }

    webgpu_pipeline pipeline    = ctx->rope_pipeline[dst->type][has_freq_factor][inplace];
    size_t          max_wg_size = ctx->max_wg_size_x;
    // enough workgroups to cover nelements(src0) / 2 work items (rounded up)
    uint32_t        wg_x        = (ggml_nelements(src0) / 2 + max_wg_size - 1) / max_wg_size;
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
|
|
1177
|
+
|
|
1178
|
+
// Encodes a GGML_OP_GLU dispatch.
//
//   src0 - first input tensor (binding 0)
//   src1 - optional second input; when non-null the "split" pipeline variant is used and src1 is
//          bound at binding 1. When null, src0's strides are passed in the src1 stride slots.
//   dst  - output tensor (always bound, right after the inputs); carries op_params
//
// Returns the command built by ggml_backend_webgpu_build.
static webgpu_command ggml_webgpu_glu(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) {
    const int split = (src1 != nullptr);

    // byte misalignment of a tensor's binding, in elements of the tensor's type
    const auto misalign_elems = [&](ggml_tensor * t) {
        return (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, t) / ggml_type_size(t->type));
    };
    // byte stride nb[dim], in elements of the tensor's type
    const auto stride_elems = [](const ggml_tensor * t, int dim) {
        return (uint32_t) (t->nb[dim] / ggml_type_size(t->type));
    };
    // bind-group entry covering tensor t at the given binding slot
    const auto bind_tensor = [&](uint32_t slot, ggml_tensor * t) {
        wgpu::BindGroupEntry entry = { .binding = slot,
                                       .buffer  = ggml_webgpu_tensor_buf(t),
                                       .offset  = ggml_webgpu_tensor_align_offset(ctx, t),
                                       .size    = ggml_webgpu_tensor_binding_size(ctx, t) };
        return entry;
    };

    // tensor whose strides fill the "src1" stride slots: src1 if present, otherwise src0
    ggml_tensor * second = split ? src1 : src0;

    std::vector<uint32_t> params = {
        misalign_elems(src0),
        split ? misalign_elems(src1) : 0,
        misalign_elems(dst),
        stride_elems(src0, 1),
        stride_elems(src0, 2),
        stride_elems(src0, 3),
        stride_elems(second, 1),
        stride_elems(second, 2),
        stride_elems(second, 3),
        stride_elems(dst, 1),
        stride_elems(dst, 2),
        stride_elems(dst, 3),
        (uint32_t) ggml_nelements(dst),
        (uint32_t) dst->ne[0],
        (uint32_t) dst->ne[1],
        (uint32_t) dst->ne[2],
        (uint32_t) ((int32_t *) dst->op_params)[1],  // swapped
        *(uint32_t *) &dst->op_params[2],            // alpha, for swiglu_oai
        *(uint32_t *) &dst->op_params[3],            // limit, for swiglu_oai
    };

    std::vector<wgpu::BindGroupEntry> entries = { bind_tensor(0, src0) };
    if (split) {
        entries.push_back(bind_tensor(1, src1));
    }
    // dst follows the inputs: slot 2 in split mode, slot 1 otherwise
    entries.push_back(bind_tensor(split ? 2 : 1, dst));

    webgpu_pipeline pipeline    = ctx->glu_pipeline[ggml_get_glu_op(dst)][dst->type][split];
    size_t          max_wg_size = ctx->max_wg_size_x;
    // one work item per dst element, rounded up to whole workgroups
    uint32_t        wg_x        = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
    return ggml_backend_webgpu_build(ctx, pipeline, params, entries, wg_x);
}
|
|
1230
|
+
|
|
1231
|
+
// Encodes a GGML_OP_SCALE dispatch.
//
//   src - input tensor (binding 0)
//   dst - output tensor; bound at binding 1 unless it is the same tensor as src, in which case the
//         in-place pipeline variant is used. op_params[0] / op_params[1] are forwarded to the
//         shader as the raw bits of the scale and bias values.
//
// Returns the command built by ggml_backend_webgpu_build.
static webgpu_command ggml_webgpu_scale(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
    int inplace = ggml_webgpu_tensor_equal(src, dst);

    // byte misalignment of a tensor's binding, in elements of the tensor's type
    const auto misalign_elems = [&](ggml_tensor * t) {
        return (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, t) / ggml_type_size(t->type));
    };
    // byte stride nb[dim], in elements of the tensor's type
    const auto stride_elems = [](const ggml_tensor * t, int dim) {
        return (uint32_t) (t->nb[dim] / ggml_type_size(t->type));
    };
    // bind-group entry covering tensor t at the given binding slot
    const auto bind_tensor = [&](uint32_t slot, ggml_tensor * t) {
        wgpu::BindGroupEntry entry = { .binding = slot,
                                       .buffer  = ggml_webgpu_tensor_buf(t),
                                       .offset  = ggml_webgpu_tensor_align_offset(ctx, t),
                                       .size    = ggml_webgpu_tensor_binding_size(ctx, t) };
        return entry;
    };

    std::vector<uint32_t> params = {
        misalign_elems(src),
        misalign_elems(dst),
        stride_elems(src, 1),
        stride_elems(src, 2),
        stride_elems(src, 3),
        stride_elems(dst, 1),
        stride_elems(dst, 2),
        stride_elems(dst, 3),
        (uint32_t) ggml_nelements(dst),
        (uint32_t) src->ne[0],
        (uint32_t) src->ne[1],
        (uint32_t) src->ne[2],
        *(uint32_t *) dst->op_params,     // scale
        *(uint32_t *) &dst->op_params[1]  // bias
    };

    std::vector<wgpu::BindGroupEntry> entries = { bind_tensor(0, src) };
    if (!inplace) {
        entries.push_back(bind_tensor(1, dst));
    }

    size_t   max_wg_size = ctx->max_wg_size_x;
    // one work item per dst element, rounded up to whole workgroups
    uint32_t wg_x        = (ggml_nelements(dst) + max_wg_size - 1) / max_wg_size;
    return ggml_backend_webgpu_build(ctx, ctx->scale_pipeline[inplace], params, entries, wg_x);
}
|
|
1268
|
+
|
|
1269
|
+
// Encodes a GGML_OP_SOFT_MAX dispatch.
//
//   src0 - input tensor (binding 0)
//   src1 - optional mask tensor; its type selects the pipeline variant, 2 stands for "no mask"
//   src2 - optional sink tensor, may be nullptr
//   dst  - output tensor; op_params[0] is forwarded as the raw bits of the scale, and the float
//          stored at op_params[1] is max_bias
//
// Bindings are assigned consecutively: src0, then the mask (if used), then the sink (if present),
// then dst (unless the operation is in place). One workgroup is dispatched per row of dst.
static webgpu_command ggml_webgpu_soft_max(webgpu_context & ctx,
                                           ggml_tensor *    src0,
                                           ggml_tensor *    src1,
                                           ggml_tensor *    src2,
                                           ggml_tensor *    dst) {
    const int inplace   = ggml_webgpu_tensor_equal(src0, dst);
    const int mask_type = (src1 != nullptr) ? src1->type : 2;  // use 2 for no mask here
    const int has_sink  = (src2 != nullptr);
    // a mask is only used when src1 exists and its type value is below 2
    const bool has_mask = mask_type < 2;

    float max_bias;
    memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
    // largest power of two not exceeding src0->ne[2], and the two bases derived from max_bias
    float n_head_log2 = float(1u << (uint32_t) floor(log2(src0->ne[2])));
    float m0          = powf(2.0f, -(max_bias) / n_head_log2);
    float m1          = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    // Copies the bit pattern of a float into a u32. memcpy is used instead of
    // `*(uint32_t *) &f`, which reads a float object through an incompatible lvalue type.
    const auto f32_bits = [](float value) {
        uint32_t bits;
        memcpy(&bits, &value, sizeof(bits));
        return bits;
    };
    // Byte misalignment of a tensor's binding, expressed in elements of the tensor's type.
    const auto misalign_elems = [&](ggml_tensor * t) {
        return (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, t) / ggml_type_size(t->type));
    };
    // Byte stride nb[dim] expressed in elements of the tensor's type.
    const auto stride_elems = [](const ggml_tensor * t, int dim) {
        return (uint32_t) (t->nb[dim] / ggml_type_size(t->type));
    };

    // NOTE: the order of these values must match the parameter layout expected by the shader.
    std::vector<uint32_t> params = {
        misalign_elems(src0),
        has_mask ? misalign_elems(src1) : 0,
        has_sink ? misalign_elems(src2) : 0,
        misalign_elems(dst),
        stride_elems(src0, 1),
        stride_elems(src0, 2),
        stride_elems(src0, 3),
        has_mask ? stride_elems(src1, 1) : 0,
        has_mask ? stride_elems(src1, 2) : 0,
        has_mask ? stride_elems(src1, 3) : 0,
        stride_elems(dst, 1),
        stride_elems(dst, 2),
        stride_elems(dst, 3),
        (uint32_t) ggml_nelements(dst),
        (uint32_t) src0->ne[0],
        (uint32_t) src0->ne[1],
        (uint32_t) src0->ne[2],
        has_mask ? (uint32_t) src1->ne[2] : 0,
        has_mask ? (uint32_t) src1->ne[3] : 0,
        *(uint32_t *) dst->op_params,  // scale
        f32_bits(max_bias),
        f32_bits(n_head_log2),
        f32_bits(m0),
        f32_bits(m1)
    };

    std::vector<wgpu::BindGroupEntry> entries = {
        { .binding = 0,
         .buffer  = ggml_webgpu_tensor_buf(src0),
         .offset  = ggml_webgpu_tensor_align_offset(ctx, src0),
         .size    = ggml_webgpu_tensor_binding_size(ctx, src0) }
    };
    // next free binding slot; advances only for tensors that are actually bound
    uint32_t binding_num = 1;
    if (has_mask) {
        entries.push_back({ .binding = binding_num,
                            .buffer  = ggml_webgpu_tensor_buf(src1),
                            .offset  = ggml_webgpu_tensor_align_offset(ctx, src1),
                            .size    = ggml_webgpu_tensor_binding_size(ctx, src1) });
        binding_num++;
    }
    if (has_sink) {
        entries.push_back({ .binding = binding_num,
                            .buffer  = ggml_webgpu_tensor_buf(src2),
                            .offset  = ggml_webgpu_tensor_align_offset(ctx, src2),
                            .size    = ggml_webgpu_tensor_binding_size(ctx, src2) });
        binding_num++;
    }
    if (!inplace) {
        entries.push_back({ .binding = binding_num,
                            .buffer  = ggml_webgpu_tensor_buf(dst),
                            .offset  = ggml_webgpu_tensor_align_offset(ctx, dst),
                            .size    = ggml_webgpu_tensor_binding_size(ctx, dst) });
    }

    return ggml_backend_webgpu_build(ctx, ctx->soft_max_pipeline[mask_type][has_sink][inplace], params, entries,
                                     ggml_nrows(dst));
}
|
|
598
1341
|
|
|
599
|
-
// Returns
|
|
600
|
-
static
|
|
1342
|
+
// Returns the encoded command, or std::nullopt if the operation is a no-op
|
|
1343
|
+
static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx, ggml_tensor * node) {
|
|
601
1344
|
if (ggml_is_empty(node)) {
|
|
602
|
-
return
|
|
1345
|
+
return std::nullopt;
|
|
603
1346
|
}
|
|
604
1347
|
WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");
|
|
605
1348
|
|
|
606
1349
|
ggml_tensor * src0 = node->src[0];
|
|
607
1350
|
ggml_tensor * src1 = node->src[1];
|
|
1351
|
+
ggml_tensor * src2 = node->src[2];
|
|
608
1352
|
|
|
609
1353
|
switch (node->op) {
|
|
610
1354
|
// no-ops
|
|
611
1355
|
case GGML_OP_NONE:
|
|
612
1356
|
case GGML_OP_VIEW:
|
|
613
1357
|
case GGML_OP_PERMUTE:
|
|
614
|
-
|
|
1358
|
+
case GGML_OP_TRANSPOSE:
|
|
1359
|
+
case GGML_OP_RESHAPE:
|
|
1360
|
+
return std::nullopt;
|
|
615
1361
|
case GGML_OP_CPY:
|
|
1362
|
+
case GGML_OP_CONT:
|
|
1363
|
+
return ggml_webgpu_cpy(ctx, src0, node);
|
|
1364
|
+
case GGML_OP_SET_ROWS:
|
|
1365
|
+
return ggml_webgpu_set_rows(ctx, src0, src1, node);
|
|
1366
|
+
case GGML_OP_GET_ROWS:
|
|
1367
|
+
return ggml_webgpu_get_rows(ctx, src0, src1, node);
|
|
1368
|
+
case GGML_OP_MUL_MAT:
|
|
1369
|
+
return ggml_webgpu_mul_mat(ctx, src0, src1, node);
|
|
1370
|
+
case GGML_OP_ADD:
|
|
616
1371
|
{
|
|
617
|
-
|
|
618
|
-
|
|
1372
|
+
int inplace = ggml_webgpu_tensor_equal(src0, node);
|
|
1373
|
+
return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->add_pipeline[node->type][inplace], inplace);
|
|
619
1374
|
}
|
|
620
|
-
case
|
|
1375
|
+
case GGML_OP_SUB:
|
|
621
1376
|
{
|
|
622
|
-
|
|
623
|
-
|
|
1377
|
+
int inplace = ggml_webgpu_tensor_equal(src0, node);
|
|
1378
|
+
return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->sub_pipeline[node->type][inplace], inplace);
|
|
624
1379
|
}
|
|
625
|
-
case
|
|
1380
|
+
case GGML_OP_MUL:
|
|
626
1381
|
{
|
|
627
|
-
|
|
628
|
-
|
|
1382
|
+
int inplace = ggml_webgpu_tensor_equal(src0, node);
|
|
1383
|
+
return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->mul_pipeline[node->type][inplace], inplace);
|
|
1384
|
+
}
|
|
1385
|
+
case GGML_OP_DIV:
|
|
1386
|
+
{
|
|
1387
|
+
int inplace = ggml_webgpu_tensor_equal(src0, node);
|
|
1388
|
+
return ggml_webgpu_binary_op(ctx, src0, src1, node, ctx->div_pipeline[node->type][inplace], inplace);
|
|
629
1389
|
}
|
|
1390
|
+
case GGML_OP_RMS_NORM:
|
|
1391
|
+
return ggml_webgpu_rms_norm(ctx, src0, node);
|
|
1392
|
+
case GGML_OP_ROPE:
|
|
1393
|
+
return ggml_webgpu_rope(ctx, src0, src1, src2, node);
|
|
1394
|
+
case GGML_OP_GLU:
|
|
1395
|
+
return ggml_webgpu_glu(ctx, src0, src1, node);
|
|
1396
|
+
case GGML_OP_SCALE:
|
|
1397
|
+
return ggml_webgpu_scale(ctx, src0, node);
|
|
1398
|
+
case GGML_OP_SOFT_MAX:
|
|
1399
|
+
return ggml_webgpu_soft_max(ctx, src0, src1, src2, node);
|
|
630
1400
|
default:
|
|
631
|
-
return
|
|
1401
|
+
return std::nullopt;
|
|
632
1402
|
}
|
|
633
|
-
return true;
|
|
634
1403
|
}
|
|
635
1404
|
|
|
636
1405
|
static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
|
@@ -639,13 +1408,35 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str
|
|
|
639
1408
|
ggml_backend_webgpu_context * backend_ctx = static_cast<ggml_backend_webgpu_context *>(backend->context);
|
|
640
1409
|
webgpu_context ctx = backend_ctx->webgpu_ctx;
|
|
641
1410
|
|
|
642
|
-
|
|
643
|
-
ggml_webgpu_encode_node(ctx, cgraph->nodes[i]);
|
|
644
|
-
}
|
|
1411
|
+
WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute);
|
|
645
1412
|
|
|
646
|
-
|
|
647
|
-
ggml_backend_webgpu_wait_on_submission(ctx);
|
|
1413
|
+
ctx->inflight_threads++;
|
|
648
1414
|
|
|
1415
|
+
std::vector<webgpu_command> commands;
|
|
1416
|
+
std::vector<webgpu_submission_futures> futures;
|
|
1417
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
1418
|
+
if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) {
|
|
1419
|
+
commands.push_back(*cmd);
|
|
1420
|
+
}
|
|
1421
|
+
// compute the batch size based on the number of inflight threads
|
|
1422
|
+
uint inflight_threads = ctx->inflight_threads;
|
|
1423
|
+
uint batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)),
|
|
1424
|
+
WEBGPU_COMMAND_SUBMIT_BATCH_SIZE);
|
|
1425
|
+
if (commands.size() >= batch_size) {
|
|
1426
|
+
futures.push_back(ggml_backend_webgpu_submit(ctx, commands));
|
|
1427
|
+
// Process events and check for completed submissions
|
|
1428
|
+
ctx->instance.ProcessEvents();
|
|
1429
|
+
ggml_backend_webgpu_wait(ctx, futures, false);
|
|
1430
|
+
commands.clear();
|
|
1431
|
+
}
|
|
1432
|
+
}
|
|
1433
|
+
if (!commands.empty()) {
|
|
1434
|
+
webgpu_submission_futures new_futures = ggml_backend_webgpu_submit(ctx, commands);
|
|
1435
|
+
futures.push_back(new_futures);
|
|
1436
|
+
}
|
|
1437
|
+
ggml_backend_webgpu_wait(ctx, futures);
|
|
1438
|
+
ctx->inflight_threads--;
|
|
1439
|
+
WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx);
|
|
649
1440
|
return GGML_STATUS_SUCCESS;
|
|
650
1441
|
}
|
|
651
1442
|
|
|
@@ -663,6 +1454,7 @@ static ggml_backend_i ggml_backend_webgpu_i = {
|
|
|
663
1454
|
/* .graph_compute = */ ggml_backend_webgpu_graph_compute,
|
|
664
1455
|
/* .event_record = */ NULL,
|
|
665
1456
|
/* .event_wait = */ NULL,
|
|
1457
|
+
/* .graph_optimize = */ NULL,
|
|
666
1458
|
};
|
|
667
1459
|
|
|
668
1460
|
/* End GGML Backend Interface */
|
|
@@ -670,7 +1462,6 @@ static ggml_backend_i ggml_backend_webgpu_i = {
|
|
|
670
1462
|
/* GGML Backend Buffer Interface */
|
|
671
1463
|
|
|
672
1464
|
// Releases everything owned by a WebGPU backend buffer: the GPU buffer itself and the
// heap-allocated buffer context stored in buffer->context.
static void ggml_backend_webgpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_backend_webgpu_buffer_context * ctx = static_cast<ggml_backend_webgpu_buffer_context *>(buffer->context);
    if (ctx == nullptr) {
        return;
    }
    ctx->buffer.Destroy();
    // The context is created with `new` in ggml_backend_webgpu_buffer_type_alloc_buffer and this
    // callback is the backend's hook for tearing the buffer down, so it has to be deleted here.
    // NOTE(review): assumes no other code path frees the context -- confirm.
    delete ctx;
    buffer->context = nullptr;  // guard against a second free through the same handle
}
|
|
@@ -691,16 +1482,19 @@ static void ggml_backend_webgpu_buffer_memset_tensor(ggml_backend_buffer_t buffe
|
|
|
691
1482
|
return;
|
|
692
1483
|
}
|
|
693
1484
|
|
|
694
|
-
|
|
695
|
-
<< offset << ", " << size << ")");
|
|
1485
|
+
WEBGPU_CPU_PROFILE_TOTAL_START(memset_tensor);
|
|
696
1486
|
|
|
697
1487
|
ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
|
|
698
1488
|
|
|
1489
|
+
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_memset_tensor(" << buf_ctx->label << ", " << tensor << ", " << value
|
|
1490
|
+
<< ", " << offset << ", " << size << ")");
|
|
1491
|
+
|
|
699
1492
|
size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
|
|
700
1493
|
|
|
701
1494
|
// This is a trick to set all bytes of a u32 to the same 1 byte value.
|
|
702
1495
|
uint32_t val32 = (uint32_t) value * 0x01010101;
|
|
703
1496
|
ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, total_offset, size);
|
|
1497
|
+
WEBGPU_CPU_PROFILE_TOTAL_END(memset_tensor, buf_ctx->webgpu_ctx);
|
|
704
1498
|
}
|
|
705
1499
|
|
|
706
1500
|
static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
@@ -708,11 +1502,13 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
708
1502
|
const void * data,
|
|
709
1503
|
size_t offset,
|
|
710
1504
|
size_t size) {
|
|
711
|
-
|
|
712
|
-
<< offset << ", " << size << ")");
|
|
1505
|
+
WEBGPU_CPU_PROFILE_TOTAL_START(set_tensor);
|
|
713
1506
|
ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
|
|
714
1507
|
webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
|
|
715
1508
|
|
|
1509
|
+
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_set_tensor(" << buf_ctx->label << ", " << tensor << ", " << data
|
|
1510
|
+
<< ", " << offset << ", " << size << ")");
|
|
1511
|
+
|
|
716
1512
|
size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
|
|
717
1513
|
|
|
718
1514
|
webgpu_ctx->queue.WriteBuffer(buf_ctx->buffer, total_offset, data, (size / 4) * 4);
|
|
@@ -728,12 +1524,21 @@ static void ggml_backend_webgpu_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
728
1524
|
((uint8_t *) &val32)[i] = ((const uint8_t *) data)[size - remaining_size + i];
|
|
729
1525
|
}
|
|
730
1526
|
// memset the remaining bytes
|
|
731
|
-
ggml_backend_webgpu_buffer_memset(
|
|
732
|
-
|
|
1527
|
+
ggml_backend_webgpu_buffer_memset(webgpu_ctx, buf_ctx->buffer, val32, total_offset + (size - remaining_size),
|
|
1528
|
+
remaining_size);
|
|
733
1529
|
} else {
|
|
734
1530
|
// wait for WriteBuffer to complete
|
|
735
|
-
|
|
1531
|
+
webgpu_ctx->instance.WaitAny(
|
|
1532
|
+
webgpu_ctx->queue.OnSubmittedWorkDone(wgpu::CallbackMode::AllowSpontaneous,
|
|
1533
|
+
[](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) {
|
|
1534
|
+
if (status != wgpu::QueueWorkDoneStatus::Success) {
|
|
1535
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n",
|
|
1536
|
+
std::string(message).c_str());
|
|
1537
|
+
}
|
|
1538
|
+
}),
|
|
1539
|
+
UINT64_MAX);
|
|
736
1540
|
}
|
|
1541
|
+
WEBGPU_CPU_PROFILE_TOTAL_END(set_tensor, webgpu_ctx);
|
|
737
1542
|
}
|
|
738
1543
|
|
|
739
1544
|
static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
@@ -741,12 +1546,12 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
741
1546
|
void * data,
|
|
742
1547
|
size_t offset,
|
|
743
1548
|
size_t size) {
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
webgpu_context
|
|
749
|
-
wgpu::Device
|
|
1549
|
+
WEBGPU_CPU_PROFILE_TOTAL_START(get_tensor);
|
|
1550
|
+
ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
|
|
1551
|
+
WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_get_tensor(" << buf_ctx->label << ", " << tensor << ", " << data
|
|
1552
|
+
<< ", " << offset << ", " << size << ")");
|
|
1553
|
+
webgpu_context webgpu_ctx = buf_ctx->webgpu_ctx;
|
|
1554
|
+
wgpu::Device device = webgpu_ctx->device;
|
|
750
1555
|
|
|
751
1556
|
size_t total_offset = webgpu_tensor_offset(tensor) + tensor->view_offs + offset;
|
|
752
1557
|
|
|
@@ -763,11 +1568,8 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
763
1568
|
if (webgpu_ctx->get_tensor_staging_buf) {
|
|
764
1569
|
webgpu_ctx->get_tensor_staging_buf.Destroy();
|
|
765
1570
|
}
|
|
766
|
-
ggml_webgpu_create_buffer(device,
|
|
767
|
-
|
|
768
|
-
final_size,
|
|
769
|
-
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead,
|
|
770
|
-
"get_tensor_staging_buf");
|
|
1571
|
+
ggml_webgpu_create_buffer(device, webgpu_ctx->get_tensor_staging_buf, final_size,
|
|
1572
|
+
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "get_tensor_staging_buf");
|
|
771
1573
|
}
|
|
772
1574
|
|
|
773
1575
|
// Copy the data from the buffer to the staging buffer
|
|
@@ -786,12 +1588,15 @@ static void ggml_backend_webgpu_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
786
1588
|
// Copy the data from the mapped range to the output buffer
|
|
787
1589
|
std::memcpy(data, mapped_range, size);
|
|
788
1590
|
webgpu_ctx->get_tensor_staging_buf.Unmap();
|
|
1591
|
+
WEBGPU_CPU_PROFILE_TOTAL_END(get_tensor, webgpu_ctx);
|
|
789
1592
|
}
|
|
790
1593
|
|
|
791
1594
|
// Fills the whole backend buffer (buffer->size bytes starting at offset 0) with the byte `value`.
static void ggml_backend_webgpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_clear(" << buffer << ", " << (uint32_t) value << ")");
    WEBGPU_CPU_PROFILE_TOTAL_START(clear);
    ggml_backend_webgpu_buffer_context * buf_ctx = (ggml_backend_webgpu_buffer_context *) buffer->context;
    // The memset helper takes a 32-bit fill value, so replicate the byte into all four bytes of
    // the word (same trick as ggml_backend_webgpu_buffer_memset_tensor). Passing the raw byte
    // would fill only one byte of each word for any non-zero value.
    const uint32_t val32 = (uint32_t) value * 0x01010101;
    ggml_backend_webgpu_buffer_memset(buf_ctx->webgpu_ctx, buf_ctx->buffer, val32, 0, buffer->size);
    WEBGPU_CPU_PROFILE_TOTAL_END(clear, buf_ctx->webgpu_ctx);
}
|
|
796
1601
|
|
|
797
1602
|
static ggml_backend_buffer_i ggml_backend_webgpu_buffer_interface = {
|
|
@@ -817,17 +1622,20 @@ static const char * ggml_backend_webgpu_buffer_type_get_name(ggml_backend_buffer
|
|
|
817
1622
|
|
|
818
1623
|
// Allocates a WebGPU storage buffer for the given buffer type and wraps it in a ggml backend buffer.
//
//   buft - buffer type; its device context supplies the WebGPU context/device
//   size - requested size in bytes; the GPU allocation is rounded up to a multiple of
//          WEBGPU_STORAGE_BUF_BINDING_MULT, while the ggml buffer reports the requested size
//
// Every buffer gets a unique label "tensor_buf<N>" taken from a process-wide atomic counter.
static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                                                          size_t                     size) {
    static std::atomic<int> buffer_count;

    int         id    = buffer_count++;
    std::string label = "tensor_buf" + std::to_string(id);
    WEBGPU_LOG_DEBUG("ggml_backend_webgpu_buffer_type_alloc_buffer_" << id << ": " << size << " bytes");

    ggml_backend_webgpu_device_context * dev_ctx =
        static_cast<ggml_backend_webgpu_device_context *>(buft->device->context);

    // round the allocation up to the storage-buffer binding granularity (power-of-two mask trick)
    const auto padded_size = (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1);
    const auto usage = wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst;

    wgpu::Buffer gpu_buf;
    ggml_webgpu_create_buffer(dev_ctx->webgpu_ctx->device, gpu_buf, padded_size, usage, label.c_str());

    // the context is attached to the ggml buffer as its opaque `context` pointer
    ggml_backend_webgpu_buffer_context * buf_ctx =
        new ggml_backend_webgpu_buffer_context(dev_ctx->webgpu_ctx, gpu_buf, label);

    return ggml_backend_buffer_init(buft, ggml_backend_webgpu_buffer_interface, buf_ctx, size);
}
|
|
@@ -887,9 +1695,17 @@ static ggml_guid_t ggml_backend_webgpu_guid(void) {
|
|
|
887
1695
|
return reinterpret_cast<ggml_guid_t>((void *) guid_str);
|
|
888
1696
|
}
|
|
889
1697
|
|
|
1698
|
+
// Workgroup size is a common constant
|
|
1699
|
+
static std::vector<wgpu::ConstantEntry> ggml_webgpu_wg_size_entry(uint32_t wg_size) {
|
|
1700
|
+
std::vector<wgpu::ConstantEntry> constants(1);
|
|
1701
|
+
constants[0].key = "wg_size";
|
|
1702
|
+
constants[0].value = wg_size;
|
|
1703
|
+
return constants;
|
|
1704
|
+
}
|
|
1705
|
+
|
|
890
1706
|
static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
|
|
891
1707
|
// we use the maximum workgroup size for the memset pipeline
|
|
892
|
-
size_t max_wg_size = webgpu_ctx->
|
|
1708
|
+
size_t max_wg_size = webgpu_ctx->max_wg_size_x;
|
|
893
1709
|
size_t max_threads = max_wg_size * webgpu_ctx->limits.maxComputeWorkgroupsPerDimension;
|
|
894
1710
|
// Size the bytes_per_thread so that the largest buffer size can be handled
|
|
895
1711
|
webgpu_ctx->memset_bytes_per_thread =
|
|
@@ -903,109 +1719,411 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) {
|
|
|
903
1719
|
}
|
|
904
1720
|
|
|
905
1721
|
static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) {
|
|
906
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
"
|
|
910
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
"
|
|
914
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
"
|
|
918
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
"
|
|
922
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
"
|
|
926
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
"
|
|
930
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
"
|
|
934
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
"
|
|
938
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
"
|
|
942
|
-
ggml_webgpu_create_pipeline(webgpu_ctx->device,
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
1722
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32],
|
|
1723
|
+
wgsl_mul_mat_q4_0_f32, "mul_mat_q4_0_f32");
|
|
1724
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32],
|
|
1725
|
+
wgsl_mul_mat_q4_1_f32, "mul_mat_q4_1_f32");
|
|
1726
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32],
|
|
1727
|
+
wgsl_mul_mat_q5_0_f32, "mul_mat_q5_0_f32");
|
|
1728
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32],
|
|
1729
|
+
wgsl_mul_mat_q5_1_f32, "mul_mat_q5_1_f32");
|
|
1730
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32],
|
|
1731
|
+
wgsl_mul_mat_q8_0_f32, "mul_mat_q8_0_f32");
|
|
1732
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32],
|
|
1733
|
+
wgsl_mul_mat_q2_k_f32, "mul_mat_q2_k_f32");
|
|
1734
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32],
|
|
1735
|
+
wgsl_mul_mat_q3_k_f32, "mul_mat_q3_k_f32");
|
|
1736
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32],
|
|
1737
|
+
wgsl_mul_mat_q4_k_f32, "mul_mat_q4_k_f32");
|
|
1738
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32],
|
|
1739
|
+
wgsl_mul_mat_q5_k_f32, "mul_mat_q5_k_f32");
|
|
1740
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32],
|
|
1741
|
+
wgsl_mul_mat_q6_k_f32, "mul_mat_q6_k_f32");
|
|
1742
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32],
|
|
1743
|
+
wgsl_mul_mat_iq2_xxs_f32, "mul_mat_iq2_xxs_f32");
|
|
1744
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32],
|
|
1745
|
+
wgsl_mul_mat_iq2_xs_f32, "mul_mat_iq2_xs_f32");
|
|
1746
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32],
|
|
1747
|
+
wgsl_mul_mat_iq2_s_f32, "mul_mat_iq2_s_f32");
|
|
1748
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32],
|
|
1749
|
+
wgsl_mul_mat_iq3_xxs_f32, "mul_mat_iq3_xxs_f32");
|
|
1750
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32],
|
|
1751
|
+
wgsl_mul_mat_iq3_s_f32, "mul_mat_iq3_s_f32");
|
|
1752
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32],
|
|
1753
|
+
wgsl_mul_mat_iq1_s_f32, "mul_mat_iq1_s_f32");
|
|
1754
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32],
|
|
1755
|
+
wgsl_mul_mat_iq1_m_f32, "mul_mat_iq1_m_f32");
|
|
1756
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32],
|
|
1757
|
+
wgsl_mul_mat_iq4_nl_f32, "mul_mat_iq4_nl_f32");
|
|
1758
|
+
ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32],
|
|
1759
|
+
wgsl_mul_mat_iq4_xs_f32, "mul_mat_iq4_xs_f32");
|
|
1760
|
+
|
|
1761
|
+
if (webgpu_ctx->supports_subgroup_matrix) {
|
|
1762
|
+
std::map<std::string, std::string> sg_matrix_repls;
|
|
1763
|
+
sg_matrix_repls["WEBGPU_MAX_SUBGROUP_SIZE"] = std::to_string(webgpu_ctx->subgroup_size);
|
|
1764
|
+
sg_matrix_repls["WEBGPU_TILE_K"] = std::to_string(WEBGPU_MUL_MAT_TILE_K);
|
|
1765
|
+
sg_matrix_repls["WEBGPU_SUBGROUP_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_M);
|
|
1766
|
+
sg_matrix_repls["WEBGPU_SUBGROUP_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_N);
|
|
1767
|
+
sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_M"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_M);
|
|
1768
|
+
sg_matrix_repls["WEBGPU_SUBGROUP_MATRIX_N"] = std::to_string(WEBGPU_MUL_MAT_SUBGROUP_MATRIX_N);
|
|
1769
|
+
sg_matrix_repls["WEBGPU_SG_MAT_M_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.M);
|
|
1770
|
+
sg_matrix_repls["WEBGPU_SG_MAT_N_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.N);
|
|
1771
|
+
sg_matrix_repls["WEBGPU_SG_MAT_K_SIZE"] = std::to_string(webgpu_ctx->subgroup_matrix_config.K);
|
|
1772
|
+
|
|
1773
|
+
std::string proc_mul_mat_subgroup_matrix_f32_f32 =
|
|
1774
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32, sg_matrix_repls);
|
|
1775
|
+
std::string proc_mul_mat_subgroup_matrix_f32_f32_vec =
|
|
1776
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f32_f32_vec, sg_matrix_repls);
|
|
1777
|
+
std::string proc_mul_mat_subgroup_matrix_f16_f32 =
|
|
1778
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32, sg_matrix_repls);
|
|
1779
|
+
std::string proc_mul_mat_subgroup_matrix_f16_f32_vec =
|
|
1780
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f32_vec, sg_matrix_repls);
|
|
1781
|
+
std::string proc_mul_mat_subgroup_matrix_f16_f16 =
|
|
1782
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16, sg_matrix_repls);
|
|
1783
|
+
std::string proc_mul_mat_subgroup_matrix_f16_f16_vec =
|
|
1784
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_f16_f16_vec, sg_matrix_repls);
|
|
1785
|
+
std::string proc_mul_mat_subgroup_matrix_q4_0_f32 =
|
|
1786
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32, sg_matrix_repls);
|
|
1787
|
+
std::string proc_mul_mat_subgroup_matrix_q4_0_f32_vec =
|
|
1788
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_subgroup_matrix_q4_0_f32_vec, sg_matrix_repls);
|
|
1789
|
+
|
|
1790
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1791
|
+
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32.c_str(), "mul_mat_subgroup_matrix_f32_f32");
|
|
1792
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
|
|
1793
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f32_f32_vec.c_str(),
|
|
1794
|
+
"mul_mat_subgroup_matrix_f32_f32_vec");
|
|
1795
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1796
|
+
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32.c_str(), "mul_mat_subgroup_matrix_f16_f32");
|
|
1797
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
|
|
1798
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f32_vec.c_str(),
|
|
1799
|
+
"mul_mat_subgroup_matrix_f16_f32_vec");
|
|
1800
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
|
|
1801
|
+
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16.c_str(), "mul_mat_subgroup_matrix_f16_f16");
|
|
1802
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
|
|
1803
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_f16_f16_vec.c_str(),
|
|
1804
|
+
"mul_mat_subgroup_matrix_f16_f16_vec");
|
|
1805
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1806
|
+
webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32.c_str(), "mul_mat_subgroup_matrix_q4_0_f32");
|
|
1807
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
|
|
1808
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_subgroup_matrix_q4_0_f32_vec.c_str(),
|
|
1809
|
+
"mul_mat_subgroup_matrix_q4_0_f32_vec");
|
|
1810
|
+
} else {
|
|
1811
|
+
std::vector<wgpu::ConstantEntry> mul_mat_reg_tile_constants(3);
|
|
1812
|
+
mul_mat_reg_tile_constants[0].key = "TILE_K";
|
|
1813
|
+
mul_mat_reg_tile_constants[0].value = WEBGPU_MUL_MAT_TILE_K;
|
|
1814
|
+
mul_mat_reg_tile_constants[1].key = "WORKGROUP_SIZE_M";
|
|
1815
|
+
mul_mat_reg_tile_constants[1].value = WEBGPU_MUL_MAT_WG_SIZE_M;
|
|
1816
|
+
mul_mat_reg_tile_constants[2].key = "WORKGROUP_SIZE_N";
|
|
1817
|
+
mul_mat_reg_tile_constants[2].value = WEBGPU_MUL_MAT_WG_SIZE_N;
|
|
1818
|
+
|
|
1819
|
+
std::map<std::string, std::string> reg_repls;
|
|
1820
|
+
reg_repls["WEBGPU_TILE_M"] = std::to_string(WEBGPU_MUL_MAT_TILE_M);
|
|
1821
|
+
reg_repls["WEBGPU_TILE_N"] = std::to_string(WEBGPU_MUL_MAT_TILE_N);
|
|
1822
|
+
|
|
1823
|
+
// Process each reg-tile shader with tile replacements.
|
|
1824
|
+
// Keep the processed strings in-scope so .c_str() remains valid.
|
|
1825
|
+
std::string proc_mul_mat_reg_tile_f32_f32 =
|
|
1826
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32, reg_repls);
|
|
1827
|
+
std::string proc_mul_mat_reg_tile_f32_f32_vec =
|
|
1828
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f32_f32_vec, reg_repls);
|
|
1829
|
+
std::string proc_mul_mat_reg_tile_f16_f32 =
|
|
1830
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32, reg_repls);
|
|
1831
|
+
std::string proc_mul_mat_reg_tile_f16_f32_vec =
|
|
1832
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f32_vec, reg_repls);
|
|
1833
|
+
std::string proc_mul_mat_reg_tile_f16_f16 =
|
|
1834
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16, reg_repls);
|
|
1835
|
+
std::string proc_mul_mat_reg_tile_f16_f16_vec =
|
|
1836
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_f16_f16_vec, reg_repls);
|
|
1837
|
+
std::string proc_mul_mat_reg_tile_q4_0_f32 =
|
|
1838
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32, reg_repls);
|
|
1839
|
+
std::string proc_mul_mat_reg_tile_q4_0_f32_vec =
|
|
1840
|
+
ggml_webgpu_process_shader_repls(wgsl_mul_mat_reg_tile_q4_0_f32_vec, reg_repls);
|
|
1841
|
+
|
|
1842
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] =
|
|
1843
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32.c_str(),
|
|
1844
|
+
"mul_mat_reg_tile_f32_f32", mul_mat_reg_tile_constants);
|
|
1845
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] =
|
|
1846
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f32_f32_vec.c_str(),
|
|
1847
|
+
"mul_mat_reg_tile_f32_f32_vec", mul_mat_reg_tile_constants);
|
|
1848
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] =
|
|
1849
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32.c_str(),
|
|
1850
|
+
"mul_mat_reg_tile_f16_f32", mul_mat_reg_tile_constants);
|
|
1851
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] =
|
|
1852
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f32_vec.c_str(),
|
|
1853
|
+
"mul_mat_reg_tile_f16_f32_vec", mul_mat_reg_tile_constants);
|
|
1854
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] =
|
|
1855
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16.c_str(),
|
|
1856
|
+
"mul_mat_reg_tile_f16_f16", mul_mat_reg_tile_constants);
|
|
1857
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] =
|
|
1858
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_f16_f16_vec.c_str(),
|
|
1859
|
+
"mul_mat_reg_tile_f16_f16_vec", mul_mat_reg_tile_constants);
|
|
1860
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] =
|
|
1861
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32.c_str(),
|
|
1862
|
+
"mul_mat_reg_tile_q4_0_f32", mul_mat_reg_tile_constants);
|
|
1863
|
+
webgpu_ctx->mul_mat_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][1] =
|
|
1864
|
+
ggml_webgpu_create_pipeline2(webgpu_ctx->device, proc_mul_mat_reg_tile_q4_0_f32_vec.c_str(),
|
|
1865
|
+
"mul_mat_reg_tile_q4_0_f32_vec", mul_mat_reg_tile_constants);
|
|
1866
|
+
}
|
|
1867
|
+
|
|
1868
|
+
std::vector<wgpu::ConstantEntry> mul_mat_vec_constants(3);
|
|
1869
|
+
mul_mat_vec_constants[0].key = "WORKGROUP_SIZE";
|
|
1870
|
+
mul_mat_vec_constants[0].value = WEBGPU_MUL_MAT_VEC_WG_SIZE;
|
|
1871
|
+
mul_mat_vec_constants[1].key = "TILE_K";
|
|
1872
|
+
mul_mat_vec_constants[1].value = WEBGPU_MUL_MAT_VEC_TILE_K;
|
|
1873
|
+
mul_mat_vec_constants[2].key = "OUTPUTS_PER_WG";
|
|
1874
|
+
mul_mat_vec_constants[2].value = WEBGPU_MUL_MAT_VEC_OUTPUTS_PER_WG;
|
|
1875
|
+
|
|
1876
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1877
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32, "mul_mat_vec_f32_f32", mul_mat_vec_constants);
|
|
1878
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F32][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
|
|
1879
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f32_f32_vec, "mul_mat_vec_f32_f32_vec", mul_mat_vec_constants);
|
|
1880
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1881
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32, "mul_mat_vec_f16_f32", mul_mat_vec_constants);
|
|
1882
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F32][1] = ggml_webgpu_create_pipeline2(
|
|
1883
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f32_vec, "mul_mat_vec_f16_f32_vec", mul_mat_vec_constants);
|
|
1884
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][0] = ggml_webgpu_create_pipeline2(
|
|
1885
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16, "mul_mat_vec_f16_f16", mul_mat_vec_constants);
|
|
1886
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_F16][GGML_TYPE_F16][1] = ggml_webgpu_create_pipeline2(
|
|
1887
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_f16_f16_vec, "mul_mat_vec_f16_f16_vec", mul_mat_vec_constants);
|
|
1888
|
+
webgpu_ctx->mul_mat_vec_pipelines[GGML_TYPE_Q4_0][GGML_TYPE_F32][0] = ggml_webgpu_create_pipeline2(
|
|
1889
|
+
webgpu_ctx->device, wgsl_mul_mat_vec_q4_0_f32, "mul_mat_vec_q4_0_f32", mul_mat_vec_constants);
|
|
994
1890
|
}
|
|
995
1891
|
|
|
996
1892
|
// Creates the two SET_ROWS compute pipelines and stores them in
// webgpu_ctx->set_rows_pipeline[0][0] (wgsl_set_rows_f16) and
// webgpu_ctx->set_rows_pipeline[0][1] (wgsl_set_rows_f16_vec).
// Each pipeline receives the constants returned by
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) {
    // The constant list is rebuilt for every pipeline, one call per pipeline.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label,
                                    ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x));
    };

    build(webgpu_ctx->set_rows_pipeline[0][0], wgsl_set_rows_f16, "set_rows_f16");
    build(webgpu_ctx->set_rows_pipeline[0][1], wgsl_set_rows_f16_vec, "set_rows_f16_vec");
}
|
|
1898
|
+
|
|
1899
|
+
// Creates one GET_ROWS pipeline per supported source type and stores it in
// webgpu_ctx->get_rows_pipeline[<type>]. F32 has two pipelines: the one in the
// per-type table is built from wgsl_get_rows_f32_vec, and the one built from
// wgsl_get_rows_f32 is kept separately in get_rows_f32_no_vec_pipeline.
// All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_get_rows_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    // float / integer sources
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_F32], wgsl_get_rows_f32_vec, "get_rows_f32_vec");
    build(webgpu_ctx->get_rows_f32_no_vec_pipeline, wgsl_get_rows_f32, "get_rows_f32");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_F16], wgsl_get_rows_f16, "get_rows_f16");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_I32], wgsl_get_rows_i32, "get_rows_i32");

    // Q4_0 .. Q8_0
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_0], wgsl_get_rows_q4_0, "get_rows_q4_0");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_1], wgsl_get_rows_q4_1, "get_rows_q4_1");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_0], wgsl_get_rows_q5_0, "get_rows_q5_0");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_1], wgsl_get_rows_q5_1, "get_rows_q5_1");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q8_0], wgsl_get_rows_q8_0, "get_rows_q8_0");

    // K-quants
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q2_K], wgsl_get_rows_q2_k, "get_rows_q2_k");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q3_K], wgsl_get_rows_q3_k, "get_rows_q3_k");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q4_K], wgsl_get_rows_q4_k, "get_rows_q4_k");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q5_K], wgsl_get_rows_q5_k, "get_rows_q5_k");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_Q6_K], wgsl_get_rows_q6_k, "get_rows_q6_k");

    // IQ types
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XXS], wgsl_get_rows_iq2_xxs, "get_rows_iq2_xxs");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_XS], wgsl_get_rows_iq2_xs, "get_rows_iq2_xs");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ2_S], wgsl_get_rows_iq2_s, "get_rows_iq2_s");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_XXS], wgsl_get_rows_iq3_xxs, "get_rows_iq3_xxs");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ3_S], wgsl_get_rows_iq3_s, "get_rows_iq3_s");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_S], wgsl_get_rows_iq1_s, "get_rows_iq1_s");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ1_M], wgsl_get_rows_iq1_m, "get_rows_iq1_m");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_NL], wgsl_get_rows_iq4_nl, "get_rows_iq4_nl");
    build(webgpu_ctx->get_rows_pipeline[GGML_TYPE_IQ4_XS], wgsl_get_rows_iq4_xs, "get_rows_iq4_xs");
}
|
|
1003
1948
|
|
|
1004
1949
|
// Creates the copy pipelines for every F32/F16 source/destination pairing and
// stores them in webgpu_ctx->cpy_pipeline[<first type>][<second type>], in the
// same order as the type suffixes of the shader name (cpy_<first>_<second>).
// All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F32], wgsl_cpy_f32_f32, "cpy_f32_f32");
    build(webgpu_ctx->cpy_pipeline[GGML_TYPE_F32][GGML_TYPE_F16], wgsl_cpy_f32_f16, "cpy_f32_f16");
    build(webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F32], wgsl_cpy_f16_f32, "cpy_f16_f32");
    build(webgpu_ctx->cpy_pipeline[GGML_TYPE_F16][GGML_TYPE_F16], wgsl_cpy_f16_f16, "cpy_f16_f16");
}
|
|
1960
|
+
|
|
1961
|
+
// Creates the ADD pipelines for F32 and F16. For each type, index 0 of
// webgpu_ctx->add_pipeline[<type>] holds the regular shader and index 1 the
// "_inplace" shader. All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_add_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->add_pipeline[GGML_TYPE_F32][0], wgsl_add_f32, "add_f32");
    build(webgpu_ctx->add_pipeline[GGML_TYPE_F16][0], wgsl_add_f16, "add_f16");
    build(webgpu_ctx->add_pipeline[GGML_TYPE_F32][1], wgsl_add_f32_inplace, "add_f32_inplace");
    build(webgpu_ctx->add_pipeline[GGML_TYPE_F16][1], wgsl_add_f16_inplace, "add_f16_inplace");
}
|
|
1972
|
+
|
|
1973
|
+
// Creates the SUB pipelines for F32 and F16. For each type, index 0 of
// webgpu_ctx->sub_pipeline[<type>] holds the regular shader and index 1 the
// "_inplace" shader. All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_sub_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->sub_pipeline[GGML_TYPE_F32][0], wgsl_sub_f32, "sub_f32");
    build(webgpu_ctx->sub_pipeline[GGML_TYPE_F16][0], wgsl_sub_f16, "sub_f16");
    build(webgpu_ctx->sub_pipeline[GGML_TYPE_F32][1], wgsl_sub_f32_inplace, "sub_f32_inplace");
    build(webgpu_ctx->sub_pipeline[GGML_TYPE_F16][1], wgsl_sub_f16_inplace, "sub_f16_inplace");
}
|
|
1984
|
+
|
|
1985
|
+
// Creates the MUL pipelines for F32 and F16. For each type, index 0 of
// webgpu_ctx->mul_pipeline[<type>] holds the regular shader and index 1 the
// "_inplace" shader. All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_mul_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->mul_pipeline[GGML_TYPE_F32][0], wgsl_mul_f32, "mul_f32");
    build(webgpu_ctx->mul_pipeline[GGML_TYPE_F16][0], wgsl_mul_f16, "mul_f16");
    build(webgpu_ctx->mul_pipeline[GGML_TYPE_F32][1], wgsl_mul_f32_inplace, "mul_f32_inplace");
    build(webgpu_ctx->mul_pipeline[GGML_TYPE_F16][1], wgsl_mul_f16_inplace, "mul_f16_inplace");
}
|
|
1996
|
+
|
|
1997
|
+
// Creates the DIV pipelines for F32 and F16. For each type, index 0 of
// webgpu_ctx->div_pipeline[<type>] holds the regular shader and index 1 the
// "_inplace" shader. All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_div_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->div_pipeline[GGML_TYPE_F32][0], wgsl_div_f32, "div_f32");
    build(webgpu_ctx->div_pipeline[GGML_TYPE_F16][0], wgsl_div_f16, "div_f16");
    build(webgpu_ctx->div_pipeline[GGML_TYPE_F32][1], wgsl_div_f32_inplace, "div_f32_inplace");
    build(webgpu_ctx->div_pipeline[GGML_TYPE_F16][1], wgsl_div_f16_inplace, "div_f16_inplace");
}
|
|
2008
|
+
|
|
2009
|
+
// Creates the RMS_NORM pipelines: webgpu_ctx->rms_norm_pipeline[0] from
// wgsl_rms_norm and webgpu_ctx->rms_norm_pipeline[1] from wgsl_rms_norm_inplace.
// Unlike the element-wise ops in this file, the constants here come from
// ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE) rather than max_wg_size_x.
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> row_split_constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, row_split_constants);
    };

    build(webgpu_ctx->rms_norm_pipeline[0], wgsl_rms_norm, "rms_norm");
    build(webgpu_ctx->rms_norm_pipeline[1], wgsl_rms_norm_inplace, "rms_norm_inplace");
}
|
|
2016
|
+
|
|
2017
|
+
// Creates the ROPE pipelines for F32 and F16 and stores them in
// webgpu_ctx->rope_pipeline[<type>][i][j]. Judging by the shader names, i == 1
// selects the "_ff" variant and j == 1 the "_inplace" variant.
// All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    // f32
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][0], wgsl_rope_f32, "rope_f32");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F32][0][1], wgsl_rope_f32_inplace, "rope_f32_inplace");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][0], wgsl_rope_f32_ff, "rope_f32_ff");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F32][1][1], wgsl_rope_f32_ff_inplace, "rope_f32_ff_inplace");

    // f16
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][0], wgsl_rope_f16, "rope_f16");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F16][0][1], wgsl_rope_f16_inplace, "rope_f16_inplace");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][0], wgsl_rope_f16_ff, "rope_f16_ff");
    build(webgpu_ctx->rope_pipeline[GGML_TYPE_F16][1][1], wgsl_rope_f16_ff_inplace, "rope_f16_ff_inplace");
}
|
|
2036
|
+
|
|
2037
|
+
// Creates the GLU pipelines and stores them in
// webgpu_ctx->glu_pipeline[<glu op>][<type>][k], where k == 1 holds the
// "_split" shader and k == 0 the regular one. Every op gets F32 and F16
// pipelines except SWIGLU_OAI, for which only F32 pipelines are created here.
// All pipelines share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_glu_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    // reglu
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][0], wgsl_reglu_f32, "reglu_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][0], wgsl_reglu_f16, "reglu_f16");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F32][1], wgsl_reglu_f32_split, "reglu_f32_split");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_REGLU][GGML_TYPE_F16][1], wgsl_reglu_f16_split, "reglu_f16_split");

    // geglu
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][0], wgsl_geglu_f32, "geglu_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][0], wgsl_geglu_f16, "geglu_f16");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F32][1], wgsl_geglu_f32_split, "geglu_f32_split");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU][GGML_TYPE_F16][1], wgsl_geglu_f16_split, "geglu_f16_split");

    // swiglu
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][0], wgsl_swiglu_f32, "swiglu_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][0], wgsl_swiglu_f16, "swiglu_f16");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F32][1], wgsl_swiglu_f32_split,
          "swiglu_f32_split");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU][GGML_TYPE_F16][1], wgsl_swiglu_f16_split,
          "swiglu_f16_split");

    // swiglu_oai (F32 only)
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][0], wgsl_swiglu_oai_f32,
          "swiglu_oai_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_SWIGLU_OAI][GGML_TYPE_F32][1], wgsl_swiglu_oai_f32_split,
          "swiglu_oai_f32_split");

    // geglu_erf
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][0], wgsl_geglu_erf_f32, "geglu_erf_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][0], wgsl_geglu_erf_f16, "geglu_erf_f16");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F32][1], wgsl_geglu_erf_f32_split,
          "geglu_erf_f32_split");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_ERF][GGML_TYPE_F16][1], wgsl_geglu_erf_f16_split,
          "geglu_erf_f16_split");

    // geglu_quick
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][0], wgsl_geglu_quick_f32,
          "geglu_quick_f32");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][0], wgsl_geglu_quick_f16,
          "geglu_quick_f16");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F32][1], wgsl_geglu_quick_f32_split,
          "geglu_quick_f32_split");
    build(webgpu_ctx->glu_pipeline[GGML_GLU_OP_GEGLU_QUICK][GGML_TYPE_F16][1], wgsl_geglu_quick_f16_split,
          "geglu_quick_f16_split");
}
|
|
2090
|
+
|
|
2091
|
+
// Creates the SCALE pipelines: webgpu_ctx->scale_pipeline[0] from
// wgsl_scale_f32 and webgpu_ctx->scale_pipeline[1] from wgsl_scale_f32_inplace.
// Both share the constants from
// ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x).
static void ggml_webgpu_init_scale_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> wg_constants = ggml_webgpu_wg_size_entry(webgpu_ctx->max_wg_size_x);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, wg_constants);
    };

    build(webgpu_ctx->scale_pipeline[0], wgsl_scale_f32, "scale_f32");
    build(webgpu_ctx->scale_pipeline[1], wgsl_scale_f32_inplace, "scale_f32_inplace");
}
|
|
2098
|
+
|
|
2099
|
+
// Creates the SOFT_MAX pipelines and stores them in
// webgpu_ctx->soft_max_pipeline[m][s][p]. Judging by the shader names:
//   m: 0 = "mask_f32" shaders, 1 = "mask_f16" shaders, 2 = shaders without a mask suffix
//   s: 1 = "_sink" variant
//   p: 1 = "_inplace" variant
// The constants come from ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE).
static void ggml_webgpu_init_soft_max_pipeline(webgpu_context & webgpu_ctx) {
    std::vector<wgpu::ConstantEntry> row_split_constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);

    // Forwards to ggml_webgpu_create_pipeline with the shared device and constants.
    const auto build = [&](auto & slot, const auto & shader, const char * label) {
        ggml_webgpu_create_pipeline(webgpu_ctx->device, slot, shader, label, row_split_constants);
    };

    // first index 2
    build(webgpu_ctx->soft_max_pipeline[2][0][0], wgsl_soft_max_f32, "soft_max_f32");
    build(webgpu_ctx->soft_max_pipeline[2][0][1], wgsl_soft_max_f32_inplace, "soft_max_f32_inplace");
    build(webgpu_ctx->soft_max_pipeline[2][1][0], wgsl_soft_max_f32_sink, "soft_max_f32_sink");
    build(webgpu_ctx->soft_max_pipeline[2][1][1], wgsl_soft_max_f32_sink_inplace, "soft_max_f32_sink_inplace");

    // second index 0, first index 0 then 1
    build(webgpu_ctx->soft_max_pipeline[0][0][0], wgsl_soft_max_f32_mask_f32, "soft_max_f32_mask_f32");
    build(webgpu_ctx->soft_max_pipeline[0][0][1], wgsl_soft_max_f32_mask_f32_inplace,
          "soft_max_f32_mask_f32_inplace");
    build(webgpu_ctx->soft_max_pipeline[1][0][0], wgsl_soft_max_f32_mask_f16, "soft_max_f32_mask_f16");
    build(webgpu_ctx->soft_max_pipeline[1][0][1], wgsl_soft_max_f32_mask_f16_inplace,
          "soft_max_f32_mask_f16_inplace");

    // second index 1, first index 0 then 1
    build(webgpu_ctx->soft_max_pipeline[0][1][0], wgsl_soft_max_f32_mask_f32_sink, "soft_max_f32_mask_f32_sink");
    build(webgpu_ctx->soft_max_pipeline[0][1][1], wgsl_soft_max_f32_mask_f32_sink_inplace,
          "soft_max_f32_mask_f32_sink_inplace");
    build(webgpu_ctx->soft_max_pipeline[1][1][0], wgsl_soft_max_f32_mask_f16_sink, "soft_max_f32_mask_f16_sink");
    build(webgpu_ctx->soft_max_pipeline[1][1][1], wgsl_soft_max_f32_mask_f16_sink_inplace,
          "soft_max_f32_mask_f16_sink_inplace");
}
|
|
1010
2128
|
|
|
1011
2129
|
static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
@@ -1055,24 +2173,89 @@ static bool ggml_backend_webgpu_device_supports_buft(ggml_backend_dev_t dev, ggm
|
|
|
1055
2173
|
return buft->iface.get_name == ggml_backend_webgpu_buffer_type_get_name;
|
|
1056
2174
|
}
|
|
1057
2175
|
|
|
2176
|
+
// Returns true when `type` is one of the quantized tensor types in the table
// below, and false for every other ggml_type value.
static bool ggml_webgpu_supported_qtype(ggml_type type) {
    static const ggml_type k_supported_qtypes[] = {
        GGML_TYPE_Q4_0,    GGML_TYPE_Q4_1,   GGML_TYPE_Q5_0,  GGML_TYPE_Q5_1,    GGML_TYPE_Q8_0,
        GGML_TYPE_Q2_K,    GGML_TYPE_Q3_K,   GGML_TYPE_Q4_K,  GGML_TYPE_Q5_K,    GGML_TYPE_Q6_K,
        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S,
        GGML_TYPE_IQ1_S,   GGML_TYPE_IQ1_M,  GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS,
    };

    for (const ggml_type candidate : k_supported_qtypes) {
        if (candidate == type) {
            return true;
        }
    }
    return false;
}
|
|
2202
|
+
|
|
1058
2203
|
static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
|
1059
|
-
|
|
2204
|
+
ggml_backend_webgpu_device_context * ctx = static_cast<ggml_backend_webgpu_device_context *>(dev->context);
|
|
2205
|
+
|
|
2206
|
+
webgpu_context webgpu_ctx = ctx->webgpu_ctx;
|
|
1060
2207
|
|
|
2208
|
+
ggml_tensor * src0 = op->src[0];
|
|
2209
|
+
ggml_tensor * src1 = op->src[1];
|
|
2210
|
+
ggml_tensor * src2 = op->src[2];
|
|
2211
|
+
|
|
2212
|
+
// on smaller devices (or CI), tensors may be larger than the max storage buffer size
|
|
2213
|
+
if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
|
|
2214
|
+
(src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
|
|
2215
|
+
(src1 != nullptr && ggml_nbytes(src1) > webgpu_ctx->limits.maxStorageBufferBindingSize)) {
|
|
2216
|
+
return false;
|
|
2217
|
+
}
|
|
2218
|
+
|
|
2219
|
+
bool supports_op = false;
|
|
1061
2220
|
switch (op->op) {
|
|
1062
2221
|
case GGML_OP_NONE:
|
|
1063
2222
|
case GGML_OP_VIEW:
|
|
1064
2223
|
case GGML_OP_PERMUTE:
|
|
1065
|
-
|
|
2224
|
+
case GGML_OP_TRANSPOSE:
|
|
2225
|
+
case GGML_OP_RESHAPE:
|
|
2226
|
+
supports_op = true;
|
|
2227
|
+
break;
|
|
2228
|
+
case GGML_OP_ADD:
|
|
2229
|
+
case GGML_OP_SUB:
|
|
2230
|
+
case GGML_OP_MUL:
|
|
2231
|
+
case GGML_OP_DIV:
|
|
2232
|
+
// TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE
|
|
2233
|
+
// see https://github.com/ggml-org/llama.cpp/pull/16857
|
|
2234
|
+
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) &&
|
|
2235
|
+
(src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
|
|
2236
|
+
break;
|
|
1066
2237
|
case GGML_OP_CPY:
|
|
2238
|
+
case GGML_OP_CONT:
|
|
2239
|
+
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
|
|
2240
|
+
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
|
2241
|
+
break;
|
|
1067
2242
|
case GGML_OP_SET_ROWS:
|
|
1068
|
-
|
|
2243
|
+
supports_op = (op->type == GGML_TYPE_F16 && src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I64);
|
|
2244
|
+
break;
|
|
2245
|
+
case GGML_OP_GET_ROWS:
|
|
2246
|
+
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_I32 ||
|
|
2247
|
+
ggml_webgpu_supported_qtype(src0->type)) {
|
|
2248
|
+
supports_op = (op->type == GGML_TYPE_F32);
|
|
2249
|
+
}
|
|
2250
|
+
break;
|
|
1069
2251
|
case GGML_OP_MUL_MAT:
|
|
1070
2252
|
{
|
|
1071
|
-
switch (
|
|
2253
|
+
switch (src1->type) {
|
|
1072
2254
|
case GGML_TYPE_F16:
|
|
1073
|
-
|
|
2255
|
+
supports_op |= (src0->type == GGML_TYPE_F16);
|
|
2256
|
+
break;
|
|
1074
2257
|
case GGML_TYPE_F32:
|
|
1075
|
-
switch (
|
|
2258
|
+
switch (src0->type) {
|
|
1076
2259
|
case GGML_TYPE_F32:
|
|
1077
2260
|
case GGML_TYPE_F16:
|
|
1078
2261
|
case GGML_TYPE_Q4_0:
|
|
@@ -1094,17 +2277,67 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
|
|
1094
2277
|
case GGML_TYPE_IQ1_M:
|
|
1095
2278
|
case GGML_TYPE_IQ4_NL:
|
|
1096
2279
|
case GGML_TYPE_IQ4_XS:
|
|
1097
|
-
|
|
2280
|
+
supports_op = true;
|
|
2281
|
+
break;
|
|
1098
2282
|
default:
|
|
1099
|
-
|
|
2283
|
+
break;
|
|
1100
2284
|
}
|
|
1101
2285
|
default:
|
|
1102
|
-
|
|
2286
|
+
break;
|
|
1103
2287
|
}
|
|
2288
|
+
break;
|
|
2289
|
+
}
|
|
2290
|
+
case GGML_OP_RMS_NORM:
|
|
2291
|
+
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
|
2292
|
+
break;
|
|
2293
|
+
case GGML_OP_ROPE:
|
|
2294
|
+
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
|
|
2295
|
+
break;
|
|
2296
|
+
case GGML_OP_GLU:
|
|
2297
|
+
switch (ggml_get_glu_op(op)) {
|
|
2298
|
+
case GGML_GLU_OP_REGLU:
|
|
2299
|
+
case GGML_GLU_OP_GEGLU:
|
|
2300
|
+
case GGML_GLU_OP_SWIGLU:
|
|
2301
|
+
case GGML_GLU_OP_GEGLU_ERF:
|
|
2302
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
2303
|
+
supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16;
|
|
2304
|
+
break;
|
|
2305
|
+
case GGML_GLU_OP_SWIGLU_OAI:
|
|
2306
|
+
supports_op = op->type == GGML_TYPE_F32;
|
|
2307
|
+
break;
|
|
2308
|
+
default:
|
|
2309
|
+
break;
|
|
1104
2310
|
}
|
|
2311
|
+
break;
|
|
2312
|
+
case GGML_OP_SCALE:
|
|
2313
|
+
supports_op = op->type == GGML_TYPE_F32;
|
|
2314
|
+
break;
|
|
2315
|
+
case GGML_OP_SOFT_MAX:
|
|
2316
|
+
supports_op = op->type == GGML_TYPE_F32;
|
|
2317
|
+
break;
|
|
1105
2318
|
default:
|
|
1106
|
-
|
|
2319
|
+
break;
|
|
1107
2320
|
}
|
|
2321
|
+
if (ggml_nbytes(op) > webgpu_ctx->limits.maxStorageBufferBindingSize ||
|
|
2322
|
+
(src0 != nullptr && ggml_nbytes(src0) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
|
|
2323
|
+
(src1 != nullptr && ggml_nbytes(src1) > webgpu_ctx->limits.maxStorageBufferBindingSize) ||
|
|
2324
|
+
(src2 != nullptr && ggml_nbytes(src2) > webgpu_ctx->limits.maxStorageBufferBindingSize)) {
|
|
2325
|
+
supports_op = false;
|
|
2326
|
+
WEBGPU_LOG_DEBUG("ggml_webgpu op not supported due to size: ");
|
|
2327
|
+
}
|
|
2328
|
+
|
|
2329
|
+
if (!supports_op) {
|
|
2330
|
+
WEBGPU_LOG_DEBUG("ggml_webgpu op not supported: "
|
|
2331
|
+
<< ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
|
|
2332
|
+
<< ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
|
|
2333
|
+
<< ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
|
|
2334
|
+
} else {
|
|
2335
|
+
WEBGPU_LOG_DEBUG("ggml_webgpu op supported: "
|
|
2336
|
+
<< ggml_op_name(op->op) << " with types dst: " << ggml_type_name(op->type)
|
|
2337
|
+
<< ", src0: " << (op->src[0] ? ggml_type_name(op->src[0]->type) : "null")
|
|
2338
|
+
<< ", src1: " << (op->src[1] ? ggml_type_name(op->src[1]->type) : "null"));
|
|
2339
|
+
}
|
|
2340
|
+
return supports_op;
|
|
1108
2341
|
}
|
|
1109
2342
|
|
|
1110
2343
|
static struct ggml_backend_device_i ggml_backend_webgpu_device_i = {
|
|
@@ -1145,33 +2378,92 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
1145
2378
|
GGML_ASSERT(index == 0);
|
|
1146
2379
|
WEBGPU_LOG_DEBUG("ggml_backend_reg_get_device()");
|
|
1147
2380
|
|
|
2381
|
+
WEBGPU_CPU_PROFILE_TOTAL_START(reg_get_device);
|
|
2382
|
+
|
|
1148
2383
|
ggml_backend_webgpu_reg_context * reg_ctx = static_cast<ggml_backend_webgpu_reg_context *>(reg->context);
|
|
1149
2384
|
|
|
1150
2385
|
webgpu_context ctx = reg_ctx->webgpu_ctx;
|
|
1151
2386
|
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
2387
|
+
// TODO: track need for these toggles: https://issues.chromium.org/issues/42251215
|
|
2388
|
+
const char * const adapterEnabledToggles[] = { "vulkan_enable_f16_on_nvidia", "use_vulkan_memory_model" };
|
|
2389
|
+
wgpu::DawnTogglesDescriptor adapterTogglesDesc;
|
|
2390
|
+
adapterTogglesDesc.enabledToggles = adapterEnabledToggles;
|
|
2391
|
+
adapterTogglesDesc.enabledToggleCount = 2;
|
|
2392
|
+
wgpu::RequestAdapterOptions options = {};
|
|
2393
|
+
options.nextInChain = &adapterTogglesDesc;
|
|
2394
|
+
ctx->instance.WaitAny(ctx->instance.RequestAdapter(
|
|
2395
|
+
&options, wgpu::CallbackMode::AllowSpontaneous,
|
|
2396
|
+
[&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) {
|
|
2397
|
+
if (status != wgpu::RequestAdapterStatus::Success) {
|
|
2398
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message);
|
|
2399
|
+
return;
|
|
2400
|
+
}
|
|
2401
|
+
ctx->adapter = std::move(adapter);
|
|
2402
|
+
}),
|
|
2403
|
+
UINT64_MAX);
|
|
1164
2404
|
GGML_ASSERT(ctx->adapter != nullptr);
|
|
1165
2405
|
|
|
1166
2406
|
ctx->adapter.GetLimits(&ctx->limits);
|
|
2407
|
+
ctx->max_wg_size_x = 288; // default value
|
|
1167
2408
|
|
|
1168
|
-
wgpu::AdapterInfo
|
|
2409
|
+
wgpu::AdapterInfo info{};
|
|
2410
|
+
wgpu::AdapterPropertiesSubgroupMatrixConfigs subgroup_matrix_configs{};
|
|
2411
|
+
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
2412
|
+
info.nextInChain = &subgroup_matrix_configs;
|
|
2413
|
+
}
|
|
1169
2414
|
ctx->adapter.GetInfo(&info);
|
|
1170
2415
|
|
|
2416
|
+
wgpu::SupportedFeatures features;
|
|
2417
|
+
ctx->adapter.GetFeatures(&features);
|
|
2418
|
+
// we require f16 support
|
|
2419
|
+
GGML_ASSERT(ctx->adapter.HasFeature(wgpu::FeatureName::ShaderF16));
|
|
2420
|
+
|
|
2421
|
+
// Only support square f16 matrices of size 8 or 16 for now
|
|
2422
|
+
bool valid_subgroup_matrix_config = false;
|
|
2423
|
+
if (ctx->adapter.HasFeature(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix)) {
|
|
2424
|
+
for (size_t i = 0; i < subgroup_matrix_configs.configCount; i++) {
|
|
2425
|
+
const wgpu::SubgroupMatrixConfig config = subgroup_matrix_configs.configs[i];
|
|
2426
|
+
if (config.M == config.N && config.N == config.K && (config.K == 8 || config.K == 16) &&
|
|
2427
|
+
config.componentType == wgpu::SubgroupMatrixComponentType::F16 &&
|
|
2428
|
+
config.resultComponentType == wgpu::SubgroupMatrixComponentType::F16) {
|
|
2429
|
+
ctx->subgroup_matrix_config = config;
|
|
2430
|
+
valid_subgroup_matrix_config = true;
|
|
2431
|
+
break;
|
|
2432
|
+
}
|
|
2433
|
+
}
|
|
2434
|
+
}
|
|
2435
|
+
|
|
2436
|
+
// For subgroup matrix code to be the most efficient, we would like the subgroup size to be consistent and accurate.
|
|
2437
|
+
// Unfortunately, that is not possible, so we use the maximum subgroup size reported by the adapter.
|
|
2438
|
+
ctx->subgroup_size = info.subgroupMaxSize;
|
|
2439
|
+
ctx->supports_subgroup_matrix = valid_subgroup_matrix_config;
|
|
2440
|
+
|
|
1171
2441
|
// Initialize device
|
|
1172
2442
|
std::vector<wgpu::FeatureName> required_features = { wgpu::FeatureName::ShaderF16,
|
|
1173
2443
|
wgpu::FeatureName::ImplicitDeviceSynchronization };
|
|
1174
|
-
|
|
2444
|
+
if (ctx->supports_subgroup_matrix) {
|
|
2445
|
+
required_features.push_back(wgpu::FeatureName::Subgroups);
|
|
2446
|
+
required_features.push_back(wgpu::FeatureName::ChromiumExperimentalSubgroupMatrix);
|
|
2447
|
+
}
|
|
2448
|
+
|
|
2449
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
2450
|
+
required_features.push_back(wgpu::FeatureName::TimestampQuery);
|
|
2451
|
+
#endif
|
|
2452
|
+
|
|
2453
|
+
// Enable Dawn-specific toggles to increase native performance
|
|
2454
|
+
// TODO: Don't enable for WASM builds, they won't have an effect anyways
|
|
2455
|
+
// TODO: Maybe WebGPU needs a "fast" mode where you can request compilers skip adding checks like these,
|
|
2456
|
+
// only for native performance?
|
|
2457
|
+
const char * const deviceEnabledToggles[] = { "skip_validation", "disable_robustness", "disable_workgroup_init",
|
|
2458
|
+
"disable_polyfills_on_integer_div_and_mod" };
|
|
2459
|
+
const char * const deviceDisabledToggles[] = { "timestamp_quantization" };
|
|
2460
|
+
wgpu::DawnTogglesDescriptor deviceTogglesDesc;
|
|
2461
|
+
deviceTogglesDesc.enabledToggles = deviceEnabledToggles;
|
|
2462
|
+
deviceTogglesDesc.enabledToggleCount = 4;
|
|
2463
|
+
deviceTogglesDesc.disabledToggles = deviceDisabledToggles;
|
|
2464
|
+
deviceTogglesDesc.disabledToggleCount = 1;
|
|
2465
|
+
|
|
2466
|
+
wgpu::DeviceDescriptor dev_desc;
|
|
1175
2467
|
dev_desc.requiredLimits = &ctx->limits;
|
|
1176
2468
|
dev_desc.requiredFeatures = required_features.data();
|
|
1177
2469
|
dev_desc.requiredFeatureCount = required_features.size();
|
|
@@ -1179,21 +2471,22 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
1179
2471
|
wgpu::CallbackMode::AllowSpontaneous,
|
|
1180
2472
|
[](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) {
|
|
1181
2473
|
GGML_UNUSED(device);
|
|
1182
|
-
GGML_LOG_ERROR(
|
|
1183
|
-
|
|
2474
|
+
GGML_LOG_ERROR("ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
|
2475
|
+
std::string(message).c_str());
|
|
1184
2476
|
});
|
|
1185
2477
|
dev_desc.SetUncapturedErrorCallback(
|
|
1186
2478
|
[](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) {
|
|
1187
2479
|
GGML_UNUSED(device);
|
|
1188
|
-
|
|
1189
|
-
|
|
2480
|
+
GGML_ABORT("ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast<int>(reason),
|
|
2481
|
+
std::string(message).c_str());
|
|
1190
2482
|
});
|
|
2483
|
+
dev_desc.nextInChain = &deviceTogglesDesc;
|
|
1191
2484
|
ctx->instance.WaitAny(ctx->adapter.RequestDevice(
|
|
1192
|
-
&dev_desc,
|
|
1193
|
-
wgpu::CallbackMode::AllowSpontaneous,
|
|
2485
|
+
&dev_desc, wgpu::CallbackMode::AllowSpontaneous,
|
|
1194
2486
|
[ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) {
|
|
1195
2487
|
if (status != wgpu::RequestDeviceStatus::Success) {
|
|
1196
|
-
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n",
|
|
2488
|
+
GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n",
|
|
2489
|
+
std::string(message).c_str());
|
|
1197
2490
|
return;
|
|
1198
2491
|
}
|
|
1199
2492
|
ctx->device = std::move(device);
|
|
@@ -1205,34 +2498,43 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
1205
2498
|
ctx->queue = ctx->device.GetQueue();
|
|
1206
2499
|
|
|
1207
2500
|
// Create buffer pool for shader parameters
|
|
1208
|
-
ctx->param_buf_pool.init(ctx->device,
|
|
1209
|
-
WEBGPU_NUM_PARAM_BUFS,
|
|
1210
|
-
WEBGPU_PARAMS_BUF_SIZE_BYTES,
|
|
2501
|
+
ctx->param_buf_pool.init(ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES,
|
|
1211
2502
|
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform,
|
|
1212
2503
|
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite);
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
2504
|
+
|
|
2505
|
+
#ifdef GGML_WEBGPU_GPU_PROFILE
|
|
2506
|
+
// Initialize buffer pool for timestamp queries (profiling)
|
|
2507
|
+
ctx->timestamp_query_buf_pool.init(ctx->device, WEBGPU_NUM_TIMESTAMP_QUERY_BUFS,
|
|
2508
|
+
WEBGPU_TIMESTAMP_QUERY_BUF_SIZE_BYTES,
|
|
2509
|
+
wgpu::BufferUsage::QueryResolve | wgpu::BufferUsage::CopySrc,
|
|
2510
|
+
wgpu::BufferUsage::MapRead | wgpu::BufferUsage::CopyDst);
|
|
2511
|
+
#endif
|
|
2512
|
+
|
|
2513
|
+
ctx->set_rows_error_buf_pool.init(ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES,
|
|
1216
2514
|
wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage,
|
|
1217
2515
|
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead);
|
|
1218
2516
|
|
|
1219
2517
|
ggml_webgpu_init_memset_pipeline(ctx);
|
|
1220
2518
|
ggml_webgpu_init_mul_mat_pipeline(ctx);
|
|
1221
2519
|
ggml_webgpu_init_set_rows_pipeline(ctx);
|
|
2520
|
+
ggml_webgpu_init_get_rows_pipeline(ctx);
|
|
1222
2521
|
ggml_webgpu_init_cpy_pipeline(ctx);
|
|
2522
|
+
ggml_webgpu_init_add_pipeline(ctx);
|
|
2523
|
+
ggml_webgpu_init_sub_pipeline(ctx);
|
|
2524
|
+
ggml_webgpu_init_mul_pipeline(ctx);
|
|
2525
|
+
ggml_webgpu_init_div_pipeline(ctx);
|
|
2526
|
+
ggml_webgpu_init_rms_norm_pipeline(ctx);
|
|
2527
|
+
ggml_webgpu_init_rope_pipeline(ctx);
|
|
2528
|
+
ggml_webgpu_init_glu_pipeline(ctx);
|
|
2529
|
+
ggml_webgpu_init_scale_pipeline(ctx);
|
|
2530
|
+
ggml_webgpu_init_soft_max_pipeline(ctx);
|
|
1223
2531
|
|
|
1224
2532
|
#ifdef GGML_WEBGPU_DEBUG
|
|
1225
2533
|
// Initialize debug buffers
|
|
1226
|
-
ggml_webgpu_create_buffer(ctx->device,
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
wgpu::BufferUsage::
|
|
1230
|
-
"debug_host_buf");
|
|
1231
|
-
ggml_webgpu_create_buffer(ctx->device,
|
|
1232
|
-
ctx->debug_dev_buf,
|
|
1233
|
-
WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
|
1234
|
-
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc,
|
|
1235
|
-
"debug_dev_buf");
|
|
2534
|
+
ggml_webgpu_create_buffer(ctx->device, ctx->debug_host_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
|
2535
|
+
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "debug_host_buf");
|
|
2536
|
+
ggml_webgpu_create_buffer(ctx->device, ctx->debug_dev_buf, WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t),
|
|
2537
|
+
wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, "debug_dev_buf");
|
|
1236
2538
|
#endif
|
|
1237
2539
|
|
|
1238
2540
|
static ggml_backend_webgpu_device_context device_ctx;
|
|
@@ -1243,12 +2545,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
1243
2545
|
GGML_LOG_INFO(
|
|
1244
2546
|
"ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | "
|
|
1245
2547
|
"device_desc: %s\n",
|
|
1246
|
-
info.vendorID,
|
|
1247
|
-
std::string(info.
|
|
1248
|
-
std::string(info.architecture).c_str(),
|
|
1249
|
-
info.deviceID,
|
|
1250
|
-
std::string(info.device).c_str(),
|
|
1251
|
-
std::string(info.description).c_str());
|
|
2548
|
+
info.vendorID, std::string(info.vendor).c_str(), std::string(info.architecture).c_str(), info.deviceID,
|
|
2549
|
+
std::string(info.device).c_str(), std::string(info.description).c_str());
|
|
1252
2550
|
|
|
1253
2551
|
// See GGML Backend Device Interface section
|
|
1254
2552
|
static ggml_backend_device device = {
|
|
@@ -1256,6 +2554,8 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t
|
|
|
1256
2554
|
/* .reg = */ reg,
|
|
1257
2555
|
/* .context = */ &device_ctx,
|
|
1258
2556
|
};
|
|
2557
|
+
|
|
2558
|
+
WEBGPU_CPU_PROFILE_TOTAL_END(reg_get_device, ctx);
|
|
1259
2559
|
return &device;
|
|
1260
2560
|
}
|
|
1261
2561
|
|
|
@@ -1278,11 +2578,18 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() {
|
|
|
1278
2578
|
ctx.name = GGML_WEBGPU_NAME;
|
|
1279
2579
|
ctx.device_count = 1;
|
|
1280
2580
|
|
|
2581
|
+
const char * const instanceEnabledToggles[] = { "allow_unsafe_apis" };
|
|
2582
|
+
|
|
2583
|
+
wgpu::DawnTogglesDescriptor instanceTogglesDesc;
|
|
2584
|
+
instanceTogglesDesc.enabledToggles = instanceEnabledToggles;
|
|
2585
|
+
instanceTogglesDesc.enabledToggleCount = 1;
|
|
1281
2586
|
wgpu::InstanceDescriptor instance_descriptor{};
|
|
1282
2587
|
std::vector<wgpu::InstanceFeatureName> instance_features = { wgpu::InstanceFeatureName::TimedWaitAny };
|
|
1283
2588
|
instance_descriptor.requiredFeatures = instance_features.data();
|
|
1284
2589
|
instance_descriptor.requiredFeatureCount = instance_features.size();
|
|
1285
|
-
|
|
2590
|
+
instance_descriptor.nextInChain = &instanceTogglesDesc;
|
|
2591
|
+
|
|
2592
|
+
webgpu_ctx->instance = wgpu::CreateInstance(&instance_descriptor);
|
|
1286
2593
|
GGML_ASSERT(webgpu_ctx->instance != nullptr);
|
|
1287
2594
|
|
|
1288
2595
|
static ggml_backend_reg reg = {
|