@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -15,13 +15,12 @@
|
|
|
15
15
|
|
|
16
16
|
#include <CL/cl.h>
|
|
17
17
|
|
|
18
|
+
#include <inttypes.h>
|
|
18
19
|
#include <string.h>
|
|
19
20
|
|
|
20
21
|
#include <cstddef>
|
|
21
22
|
#include <cstdint>
|
|
22
|
-
#include <atomic>
|
|
23
23
|
#include <fstream>
|
|
24
|
-
#include <limits>
|
|
25
24
|
#include <vector>
|
|
26
25
|
#include <string>
|
|
27
26
|
#include <cmath>
|
|
@@ -54,6 +53,37 @@
|
|
|
54
53
|
|
|
55
54
|
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor);
|
|
56
55
|
|
|
56
|
+
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
|
|
57
|
+
// Precompute mp (m' in the paper) and L such that division
|
|
58
|
+
// can be computed using a multiply (high 32b of 64b result)
|
|
59
|
+
// and a shift:
|
|
60
|
+
//
|
|
61
|
+
// n/d = (mulhi(n, mp) + n) >> L;
|
|
62
|
+
struct fastdiv_vals {
|
|
63
|
+
uint32_t mp;
|
|
64
|
+
uint32_t L;
|
|
65
|
+
uint32_t d;
|
|
66
|
+
uint32_t pad;
|
|
67
|
+
};
|
|
68
|
+
static_assert(sizeof(fastdiv_vals) == 16, "fastdiv_vals size incorrect");
|
|
69
|
+
|
|
70
|
+
static fastdiv_vals init_fastdiv_values(uint64_t d_64) {
|
|
71
|
+
GGML_ASSERT(d_64 != 0);
|
|
72
|
+
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());
|
|
73
|
+
|
|
74
|
+
uint32_t d = (uint32_t)d_64;
|
|
75
|
+
|
|
76
|
+
// compute L = ceil(log2(d));
|
|
77
|
+
uint32_t L = 0;
|
|
78
|
+
while (L < 32 && (uint32_t{ 1 } << L) < d) {
|
|
79
|
+
L++;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
|
|
83
|
+
// pack divisor as well to reduce error surface
|
|
84
|
+
return { mp, L, d, 0 };
|
|
85
|
+
}
|
|
86
|
+
|
|
57
87
|
enum GPU_FAMILY {
|
|
58
88
|
ADRENO,
|
|
59
89
|
INTEL,
|
|
@@ -367,7 +397,9 @@ struct ggml_backend_opencl_context {
|
|
|
367
397
|
cl_program program_mul_mv_q4_0_f32_1d_8x_flat;
|
|
368
398
|
cl_program program_mul_mv_q4_0_f32_1d_16x_flat;
|
|
369
399
|
cl_program program_mul_mv_q6_K;
|
|
400
|
+
cl_program program_mul_mv_q8_0_f32, program_mul_mv_q8_0_f32_flat;
|
|
370
401
|
cl_program program_mul_mv_mxfp4_f32;
|
|
402
|
+
cl_program program_mul_mv_mxfp4_f32_flat;
|
|
371
403
|
cl_program program_mul_mv_f16_f16;
|
|
372
404
|
cl_program program_mul_mv_f16_f32_1row;
|
|
373
405
|
cl_program program_mul_mv_f16_f32_l4;
|
|
@@ -375,6 +407,8 @@ struct ggml_backend_opencl_context {
|
|
|
375
407
|
cl_program program_mul_mv_f32_f32;
|
|
376
408
|
cl_program program_mul;
|
|
377
409
|
cl_program program_mul_mat_f16_f32_tiled;
|
|
410
|
+
cl_program program_mul_mm_f16_f32_kqv;
|
|
411
|
+
cl_program program_mul_mm_f16_f32_kq;
|
|
378
412
|
cl_program program_div;
|
|
379
413
|
cl_program program_sub;
|
|
380
414
|
cl_program program_norm;
|
|
@@ -400,10 +434,14 @@ struct ggml_backend_opencl_context {
|
|
|
400
434
|
cl_program program_conv_2d_f32;
|
|
401
435
|
cl_program program_conv_2d_f16_f32;
|
|
402
436
|
cl_program program_tsembd;
|
|
437
|
+
cl_program program_gemv_moe_mxfp4_f32, program_gemm_moe_mxfp4_f32;
|
|
403
438
|
cl_program program_mul_mv_id_q4_0_f32_8x_flat;
|
|
439
|
+
cl_program program_mul_mv_id_q8_0_f32, program_mul_mv_id_q8_0_f32_flat;
|
|
404
440
|
cl_program program_mul_mv_id_mxfp4_f32;
|
|
441
|
+
cl_program program_mul_mv_id_mxfp4_f32_flat;
|
|
405
442
|
cl_program program_mul_mm_f32_f32_l4_lm;
|
|
406
443
|
cl_program program_mul_mm_f16_f32_l4_lm;
|
|
444
|
+
cl_program program_mul_mm_q8_0_f32_l4_lm;
|
|
407
445
|
|
|
408
446
|
cl_kernel kernel_add, kernel_add_row, kernel_add_f16, kernel_add_row_f16;
|
|
409
447
|
cl_kernel kernel_mul, kernel_mul_row, kernel_mul_f16, kernel_mul_row_f16;
|
|
@@ -435,7 +473,7 @@ struct ggml_backend_opencl_context {
|
|
|
435
473
|
std::map<std::pair<int, int>, int> kernels_flash_attn_bm;
|
|
436
474
|
std::map<std::pair<int, int>, int> kernels_flash_attn_bn;
|
|
437
475
|
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
|
438
|
-
cl_kernel
|
|
476
|
+
cl_kernel kernel_set_rows_f32_i64, kernel_set_rows_f32_i32, kernel_set_rows_f16_i64, kernel_set_rows_f16_i32;
|
|
439
477
|
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
|
440
478
|
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
|
441
479
|
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
@@ -445,13 +483,18 @@ struct ggml_backend_opencl_context {
|
|
|
445
483
|
cl_kernel kernel_mul_mat_f16_f32;
|
|
446
484
|
cl_kernel kernel_mul_mat_f16_f32_l4;
|
|
447
485
|
cl_kernel kernel_mul_mat_f16_f32_tiled;
|
|
486
|
+
cl_kernel kernel_mul_mm_f16_f32_kqv;
|
|
487
|
+
cl_kernel kernel_mul_mm_f16_f32_kq;
|
|
448
488
|
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v;
|
|
449
489
|
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0;
|
|
490
|
+
cl_kernel kernel_convert_block_mxfp4, kernel_convert_block_mxfp4_trans, kernel_restore_block_mxfp4, kernel_restore_block_mxfp4_trans;
|
|
491
|
+
cl_kernel kernel_convert_block_q8_0, kernel_restore_block_q8_0;
|
|
450
492
|
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat;
|
|
451
493
|
cl_kernel kernel_convert_block_q4_0_noshuffle;
|
|
452
494
|
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat;
|
|
453
495
|
cl_kernel kernel_mul_mv_q6_K_f32;
|
|
454
|
-
cl_kernel kernel_mul_mv_mxfp4_f32;
|
|
496
|
+
cl_kernel kernel_mul_mv_mxfp4_f32, kernel_mul_mv_mxfp4_f32_flat;
|
|
497
|
+
cl_kernel kernel_mul_mv_q8_0_f32, kernel_mul_mv_q8_0_f32_flat;
|
|
455
498
|
cl_kernel kernel_im2col_f32, kernel_im2col_f16;
|
|
456
499
|
cl_kernel kernel_argsort_f32_i32;
|
|
457
500
|
cl_kernel kernel_sum_rows_f32;
|
|
@@ -467,10 +510,14 @@ struct ggml_backend_opencl_context {
|
|
|
467
510
|
cl_kernel kernel_conv_2d_f32;
|
|
468
511
|
cl_kernel kernel_conv_2d_f16_f32;
|
|
469
512
|
cl_kernel kernel_timestep_embedding;
|
|
513
|
+
cl_kernel kernel_gemv_moe_mxfp4_f32, kernel_gemm_moe_mxfp4_f32;
|
|
470
514
|
cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
|
|
515
|
+
cl_kernel kernel_mul_mv_id_q8_0_f32, kernel_mul_mv_id_q8_0_f32_flat;
|
|
471
516
|
cl_kernel kernel_mul_mv_id_mxfp4_f32;
|
|
517
|
+
cl_kernel kernel_mul_mv_id_mxfp4_f32_flat;
|
|
472
518
|
cl_kernel kernel_mul_mm_f32_f32_l4_lm;
|
|
473
519
|
cl_kernel kernel_mul_mm_f16_f32_l4_lm;
|
|
520
|
+
cl_kernel kernel_mul_mm_q8_0_f32_l4_lm;
|
|
474
521
|
|
|
475
522
|
std::vector<ProfilingInfo> profiling_info;
|
|
476
523
|
|
|
@@ -520,25 +567,17 @@ struct ggml_backend_opencl_context {
|
|
|
520
567
|
}
|
|
521
568
|
|
|
522
569
|
// Dump a csv
|
|
523
|
-
|
|
524
|
-
fprintf(fperf, "op name, kernel name, queued duration (ms), submit duration(ms), exec duration (ms), complete duration (ms), total duration (ms), global size, local size, output size\n");
|
|
570
|
+
fprintf(fperf, "op name, kernel name, exec duration (ms), global size, local size, output size\n");
|
|
525
571
|
for (const ProfilingInfo & info : profiling_info) {
|
|
526
|
-
|
|
527
|
-
fprintf(fperf, "%s,%s,%f,%f,%f,%f,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
572
|
+
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n",
|
|
528
573
|
info.op_name.c_str(), info.kernel_name.c_str(),
|
|
529
|
-
info.cmd_queued_duration_ns/1.e6f,
|
|
530
|
-
info.cmd_submit_duration_ns/1.e6f,
|
|
531
574
|
info.cmd_duration_ns/1.e6f,
|
|
532
|
-
info.cmd_complete_duration_ns/1.e6f,
|
|
533
|
-
info.cmd_total_duration_ns/1.e6f,
|
|
534
575
|
info.global_size[0], info.global_size[1], info.global_size[2],
|
|
535
576
|
info.local_size[0], info.local_size[1], info.local_size[2],
|
|
536
577
|
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]);
|
|
537
578
|
}
|
|
538
579
|
fclose(fperf);
|
|
539
580
|
|
|
540
|
-
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time);
|
|
541
|
-
|
|
542
581
|
// Dump a simple chrome trace
|
|
543
582
|
FILE* ftrace = fopen("cl_trace.json", "w");
|
|
544
583
|
if (!ftrace) {
|
|
@@ -548,14 +587,14 @@ struct ggml_backend_opencl_context {
|
|
|
548
587
|
|
|
549
588
|
fprintf(ftrace, "[\n");
|
|
550
589
|
for (const ProfilingInfo & info : profiling_info) {
|
|
551
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %
|
|
590
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
552
591
|
info.kernel_name.c_str(), info.cmd_queued/1000);
|
|
553
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %
|
|
592
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Host\"},\n",
|
|
554
593
|
info.kernel_name.c_str(), info.cmd_submit/1000);
|
|
555
594
|
|
|
556
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %
|
|
595
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"B\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
557
596
|
info.kernel_name.c_str(), info.cmd_start/1000);
|
|
558
|
-
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %
|
|
597
|
+
fprintf(ftrace, "{\"name\": \"%s\", \"cat\": \"OpenCL\", \"ph\": \"E\", \"ts\": %" PRIu64 ", \"pid\": \"\", \"tid\": \"Device\"},\n",
|
|
559
598
|
info.kernel_name.c_str(), info.cmd_end/1000);
|
|
560
599
|
}
|
|
561
600
|
fclose(ftrace);
|
|
@@ -765,6 +804,12 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
765
804
|
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0_noshuffle", &err), err));
|
|
766
805
|
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q4_0", &err), err));
|
|
767
806
|
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_0", &err), err));
|
|
807
|
+
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4", &err), err));
|
|
808
|
+
CL_CHECK((backend_ctx->kernel_convert_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_mxfp4_trans", &err), err));
|
|
809
|
+
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4_trans = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4_trans", &err), err));
|
|
810
|
+
CL_CHECK((backend_ctx->kernel_restore_block_mxfp4 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_mxfp4", &err), err));
|
|
811
|
+
CL_CHECK((backend_ctx->kernel_convert_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q8_0", &err), err));
|
|
812
|
+
CL_CHECK((backend_ctx->kernel_restore_block_q8_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q8_0", &err), err));
|
|
768
813
|
GGML_LOG_CONT(".");
|
|
769
814
|
}
|
|
770
815
|
|
|
@@ -986,6 +1031,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
986
1031
|
GGML_LOG_CONT(".");
|
|
987
1032
|
}
|
|
988
1033
|
|
|
1034
|
+
// mul_mv_q8_0_f32
|
|
1035
|
+
{
|
|
1036
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1037
|
+
const std::string kernel_src {
|
|
1038
|
+
#include "mul_mv_q8_0_f32.cl.h"
|
|
1039
|
+
};
|
|
1040
|
+
#else
|
|
1041
|
+
const std::string kernel_src = read_file("mul_mv_q8_0_f32.cl");
|
|
1042
|
+
#endif
|
|
1043
|
+
backend_ctx->program_mul_mv_q8_0_f32 =
|
|
1044
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1045
|
+
|
|
1046
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32, "kernel_mul_mv_q8_0_f32", &err), err));
|
|
1047
|
+
GGML_LOG_CONT(".");
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
// mul_mv_q8_0_f32_flat
|
|
1051
|
+
{
|
|
1052
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1053
|
+
const std::string kernel_src {
|
|
1054
|
+
#include "mul_mv_q8_0_f32_flat.cl.h"
|
|
1055
|
+
};
|
|
1056
|
+
#else
|
|
1057
|
+
const std::string kernel_src = read_file("mul_mv_q8_0_f32_flat.cl");
|
|
1058
|
+
#endif
|
|
1059
|
+
backend_ctx->program_mul_mv_q8_0_f32_flat =
|
|
1060
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1061
|
+
|
|
1062
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_q8_0_f32_flat, "kernel_mul_mv_q8_0_f32_flat", &err), err));
|
|
1063
|
+
GGML_LOG_CONT(".");
|
|
1064
|
+
}
|
|
1065
|
+
|
|
989
1066
|
// mul_mv_mxfp4_f32
|
|
990
1067
|
{
|
|
991
1068
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1002,6 +1079,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1002
1079
|
GGML_LOG_CONT(".");
|
|
1003
1080
|
}
|
|
1004
1081
|
|
|
1082
|
+
// mul_mv_mxfp4_f32_flat
|
|
1083
|
+
{
|
|
1084
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1085
|
+
const std::string kernel_src {
|
|
1086
|
+
#include "mul_mv_mxfp4_f32_flat.cl.h"
|
|
1087
|
+
};
|
|
1088
|
+
#else
|
|
1089
|
+
const std::string kernel_src = read_file("mul_mv_mxfp4_f32_flat.cl");
|
|
1090
|
+
#endif
|
|
1091
|
+
backend_ctx->program_mul_mv_mxfp4_f32_flat =
|
|
1092
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1093
|
+
|
|
1094
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_mxfp4_f32_flat, "kernel_mul_mv_mxfp4_f32_flat", &err), err));
|
|
1095
|
+
GGML_LOG_CONT(".");
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1005
1098
|
// mul_mv_f16_f16
|
|
1006
1099
|
{
|
|
1007
1100
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1130,6 +1223,41 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1130
1223
|
GGML_LOG_CONT(".");
|
|
1131
1224
|
}
|
|
1132
1225
|
|
|
1226
|
+
// mul_mm_q8_0_f32_l4_lm
|
|
1227
|
+
{
|
|
1228
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1229
|
+
const std::string kernel_src {
|
|
1230
|
+
#include "mul_mm_q8_0_f32_l4_lm.cl.h"
|
|
1231
|
+
};
|
|
1232
|
+
#else
|
|
1233
|
+
const std::string kernel_src = read_file("mul_mm_q8_0_f32_l4_lm.cl");
|
|
1234
|
+
#endif
|
|
1235
|
+
backend_ctx->program_mul_mm_q8_0_f32_l4_lm =
|
|
1236
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1237
|
+
|
|
1238
|
+
CL_CHECK((backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm = clCreateKernel(backend_ctx->program_mul_mm_q8_0_f32_l4_lm, "kernel_mul_mm_q8_0_f32_l4_lm", &err), err));
|
|
1239
|
+
GGML_LOG_CONT(".");
|
|
1240
|
+
}
|
|
1241
|
+
|
|
1242
|
+
// mul_mm_f16_f32_kq_kqv
|
|
1243
|
+
{
|
|
1244
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1245
|
+
const std::string kernel_src {
|
|
1246
|
+
#include "mul_mm_f16_f32_kq_kqv.cl.h"
|
|
1247
|
+
};
|
|
1248
|
+
#else
|
|
1249
|
+
const std::string kernel_src = read_file("mul_mm_f16_f32_kq_kqv.cl");
|
|
1250
|
+
#endif
|
|
1251
|
+
backend_ctx->program_mul_mm_f16_f32_kqv =
|
|
1252
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts+" -DKQV ");
|
|
1253
|
+
backend_ctx->program_mul_mm_f16_f32_kq =
|
|
1254
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1255
|
+
|
|
1256
|
+
CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kqv = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kqv, "mul_mm_f16_f32_kqv", &err), err));
|
|
1257
|
+
CL_CHECK((backend_ctx->kernel_mul_mm_f16_f32_kq = clCreateKernel(backend_ctx->program_mul_mm_f16_f32_kq, "mul_mm_f16_f32_kq", &err), err));
|
|
1258
|
+
GGML_LOG_CONT(".");
|
|
1259
|
+
}
|
|
1260
|
+
|
|
1133
1261
|
// mul
|
|
1134
1262
|
{
|
|
1135
1263
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1339,7 +1467,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1339
1467
|
|
|
1340
1468
|
if (!kernel_src_f16.empty() && !kernel_src_f32.empty() && !kernel_src_f32_f16.empty()) {
|
|
1341
1469
|
const struct { int dk; int dv; int bm; int bn; } fa_dims[] = {
|
|
1342
|
-
{ 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
|
1470
|
+
{ 40, 40, 32, 32}, { 64, 64, 64, 64}, { 80, 80, 64, 32}, { 96, 96, 64, 32},
|
|
1343
1471
|
{112, 112, 32, 32}, {128, 128, 32, 32}, {192, 128, 16, 16},
|
|
1344
1472
|
{192, 192, 16, 16}, {256, 256, 16, 16},
|
|
1345
1473
|
};
|
|
@@ -1649,8 +1777,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1649
1777
|
backend_ctx->program_set_rows =
|
|
1650
1778
|
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1651
1779
|
|
|
1652
|
-
CL_CHECK((backend_ctx->
|
|
1653
|
-
CL_CHECK((backend_ctx->
|
|
1780
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f32_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i64", &err), err));
|
|
1781
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f32_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32_i32", &err), err));
|
|
1782
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f16_i64 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i64", &err), err));
|
|
1783
|
+
CL_CHECK((backend_ctx->kernel_set_rows_f16_i32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16_i32", &err), err));
|
|
1654
1784
|
GGML_LOG_CONT(".");
|
|
1655
1785
|
}
|
|
1656
1786
|
|
|
@@ -1711,6 +1841,38 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1711
1841
|
GGML_LOG_CONT(".");
|
|
1712
1842
|
}
|
|
1713
1843
|
|
|
1844
|
+
// mul_mv_id_q8_0_f32
|
|
1845
|
+
{
|
|
1846
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1847
|
+
const std::string kernel_src {
|
|
1848
|
+
#include "mul_mv_id_q8_0_f32.cl.h"
|
|
1849
|
+
};
|
|
1850
|
+
#else
|
|
1851
|
+
const std::string kernel_src = read_file("mul_mv_id_q8_0_f32.cl");
|
|
1852
|
+
#endif
|
|
1853
|
+
backend_ctx->program_mul_mv_id_q8_0_f32 =
|
|
1854
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1855
|
+
|
|
1856
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32 = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32, "kernel_mul_mv_id_q8_0_f32", &err), err));
|
|
1857
|
+
GGML_LOG_CONT(".");
|
|
1858
|
+
}
|
|
1859
|
+
|
|
1860
|
+
// mul_mv_id_q8_0_f32_flat
|
|
1861
|
+
{
|
|
1862
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1863
|
+
const std::string kernel_src {
|
|
1864
|
+
#include "mul_mv_id_q8_0_f32_flat.cl.h"
|
|
1865
|
+
};
|
|
1866
|
+
#else
|
|
1867
|
+
const std::string kernel_src = read_file("mul_mv_id_q8_0_f32_flat.cl");
|
|
1868
|
+
#endif
|
|
1869
|
+
backend_ctx->program_mul_mv_id_q8_0_f32_flat =
|
|
1870
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1871
|
+
|
|
1872
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_id_q8_0_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q8_0_f32_flat, "kernel_mul_mv_id_q8_0_f32_flat", &err), err));
|
|
1873
|
+
GGML_LOG_CONT(".");
|
|
1874
|
+
}
|
|
1875
|
+
|
|
1714
1876
|
// mul_mv_id_mxfp4_f32
|
|
1715
1877
|
{
|
|
1716
1878
|
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -1727,6 +1889,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1727
1889
|
GGML_LOG_CONT(".");
|
|
1728
1890
|
}
|
|
1729
1891
|
|
|
1892
|
+
// mul_mv_id_mxfp4_f32_flat
|
|
1893
|
+
{
|
|
1894
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
1895
|
+
const std::string kernel_src {
|
|
1896
|
+
#include "mul_mv_id_mxfp4_f32_flat.cl.h"
|
|
1897
|
+
};
|
|
1898
|
+
#else
|
|
1899
|
+
const std::string kernel_src = read_file("mul_mv_id_mxfp4_f32_flat.cl");
|
|
1900
|
+
#endif
|
|
1901
|
+
backend_ctx->program_mul_mv_id_mxfp4_f32_flat =
|
|
1902
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
|
1903
|
+
|
|
1904
|
+
CL_CHECK((backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat = clCreateKernel(backend_ctx->program_mul_mv_id_mxfp4_f32_flat, "kernel_mul_mv_id_mxfp4_f32_flat", &err), err));
|
|
1905
|
+
GGML_LOG_CONT(".");
|
|
1906
|
+
}
|
|
1907
|
+
|
|
1730
1908
|
// Adreno kernels
|
|
1731
1909
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
1732
1910
|
// transpose
|
|
@@ -1862,6 +2040,42 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
|
1862
2040
|
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err));
|
|
1863
2041
|
GGML_LOG_CONT(".");
|
|
1864
2042
|
}
|
|
2043
|
+
|
|
2044
|
+
std::string CL_moe_compile_opts = std::string("-cl-std=") + opencl_c_std +
|
|
2045
|
+
" -cl-mad-enable "
|
|
2046
|
+
" -cl-fast-relaxed-math";
|
|
2047
|
+
|
|
2048
|
+
// gemv_moe_mxfp4_f32
|
|
2049
|
+
{
|
|
2050
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
2051
|
+
const std::string kernel_src {
|
|
2052
|
+
#include "gemv_moe_mxfp4_f32.cl.h"
|
|
2053
|
+
};
|
|
2054
|
+
#else
|
|
2055
|
+
const std::string kernel_src = read_file("gemv_moe_mxfp4_f32.cl");
|
|
2056
|
+
#endif
|
|
2057
|
+
backend_ctx->program_gemv_moe_mxfp4_f32 =
|
|
2058
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
|
2059
|
+
|
|
2060
|
+
CL_CHECK((backend_ctx->kernel_gemv_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemv_moe_mxfp4_f32, "kernel_gemv_moe_mxfp4_f32", &err), err));
|
|
2061
|
+
GGML_LOG_CONT(".");
|
|
2062
|
+
}
|
|
2063
|
+
|
|
2064
|
+
// gemm_moe_mxfp4_f32
|
|
2065
|
+
{
|
|
2066
|
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
2067
|
+
const std::string kernel_src {
|
|
2068
|
+
#include "gemm_moe_mxfp4_f32.cl.h"
|
|
2069
|
+
};
|
|
2070
|
+
#else
|
|
2071
|
+
const std::string kernel_src = read_file("gemm_moe_mxfp4_f32.cl");
|
|
2072
|
+
#endif
|
|
2073
|
+
backend_ctx->program_gemm_moe_mxfp4_f32 =
|
|
2074
|
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_moe_compile_opts);
|
|
2075
|
+
|
|
2076
|
+
CL_CHECK((backend_ctx->kernel_gemm_moe_mxfp4_f32 = clCreateKernel(backend_ctx->program_gemm_moe_mxfp4_f32, "kernel_gemm_moe_mxfp4_f32", &err), err));
|
|
2077
|
+
GGML_LOG_CONT(".");
|
|
2078
|
+
}
|
|
1865
2079
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
1866
2080
|
GGML_LOG_CONT("\n");
|
|
1867
2081
|
}
|
|
@@ -2237,8 +2451,13 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
|
|
|
2237
2451
|
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false");
|
|
2238
2452
|
|
|
2239
2453
|
if (opencl_c_version.major >= 3) {
|
|
2454
|
+
// Assume it is not available for 3.0, since it is optional in 3.0.
|
|
2455
|
+
// If compiling against 3.0, then we can query.
|
|
2456
|
+
backend_ctx->non_uniform_workgroups = false;
|
|
2457
|
+
#if CL_TARGET_OPENCL_VERSION >= 300
|
|
2240
2458
|
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool),
|
|
2241
2459
|
&backend_ctx->non_uniform_workgroups, 0));
|
|
2460
|
+
#endif
|
|
2242
2461
|
} else {
|
|
2243
2462
|
GGML_ASSERT(opencl_c_version.major == 2);
|
|
2244
2463
|
// Non-uniform workgroup sizes is mandatory feature in v2.x.
|
|
@@ -2391,6 +2610,84 @@ struct ggml_tensor_extra_cl_q4_0 {
|
|
|
2391
2610
|
}
|
|
2392
2611
|
};
|
|
2393
2612
|
|
|
2613
|
+
struct ggml_tensor_extra_cl_mxfp4 {
|
|
2614
|
+
// Quantized values.
|
|
2615
|
+
cl_mem q = nullptr;
|
|
2616
|
+
// Quantized values in image1d_buffer_t.
|
|
2617
|
+
cl_mem q_img = nullptr;
|
|
2618
|
+
// Scales in E8M0.
|
|
2619
|
+
cl_mem e = nullptr;
|
|
2620
|
+
// Scales in image1d_buffer_t.
|
|
2621
|
+
cl_mem e_img = nullptr;
|
|
2622
|
+
// Size of quantized values.
|
|
2623
|
+
size_t size_q = 0;
|
|
2624
|
+
// Size of scales.
|
|
2625
|
+
size_t size_e = 0;
|
|
2626
|
+
|
|
2627
|
+
~ggml_tensor_extra_cl_mxfp4() {
|
|
2628
|
+
reset();
|
|
2629
|
+
}
|
|
2630
|
+
|
|
2631
|
+
void reset() {
|
|
2632
|
+
// q and e are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
|
|
2633
|
+
// They must be properly released so that the original buffer can be
|
|
2634
|
+
// properly released to avoid memory leak.
|
|
2635
|
+
if (q != nullptr) {
|
|
2636
|
+
CL_CHECK(clReleaseMemObject(q));
|
|
2637
|
+
q = nullptr;
|
|
2638
|
+
}
|
|
2639
|
+
if (e != nullptr) {
|
|
2640
|
+
CL_CHECK(clReleaseMemObject(e));
|
|
2641
|
+
e = nullptr;
|
|
2642
|
+
}
|
|
2643
|
+
if (q != nullptr) {
|
|
2644
|
+
CL_CHECK(clReleaseMemObject(q_img));
|
|
2645
|
+
q = nullptr;
|
|
2646
|
+
}
|
|
2647
|
+
// Currently, q_img and d_img are not used. They can be image1d_buffer_t
|
|
2648
|
+
// that wraps around q and d to utilize image access path.
|
|
2649
|
+
q_img = nullptr;
|
|
2650
|
+
e_img = nullptr;
|
|
2651
|
+
size_q = 0;
|
|
2652
|
+
size_e = 0;
|
|
2653
|
+
}
|
|
2654
|
+
};
|
|
2655
|
+
|
|
2656
|
+
struct ggml_tensor_extra_cl_q8_0 {
|
|
2657
|
+
cl_mem q = nullptr;
|
|
2658
|
+
cl_mem q_img = nullptr;
|
|
2659
|
+
|
|
2660
|
+
cl_mem d = nullptr;
|
|
2661
|
+
cl_mem d_img = nullptr;
|
|
2662
|
+
|
|
2663
|
+
size_t size_q = 0;
|
|
2664
|
+
size_t size_d = 0;
|
|
2665
|
+
|
|
2666
|
+
~ggml_tensor_extra_cl_q8_0() {
|
|
2667
|
+
reset();
|
|
2668
|
+
}
|
|
2669
|
+
|
|
2670
|
+
void reset() {
|
|
2671
|
+
// q and d are subbuffers into the bigger buffer allocated in ggml_backend_buffer.
|
|
2672
|
+
// They must be properly released so that the original buffer can be
|
|
2673
|
+
// properly released to avoid memory leak.
|
|
2674
|
+
if (q != nullptr) {
|
|
2675
|
+
CL_CHECK(clReleaseMemObject(q));
|
|
2676
|
+
q = nullptr;
|
|
2677
|
+
}
|
|
2678
|
+
if (d != nullptr) {
|
|
2679
|
+
CL_CHECK(clReleaseMemObject(d));
|
|
2680
|
+
d = nullptr;
|
|
2681
|
+
}
|
|
2682
|
+
// Currently, q_img and d_img are not used. They can be image1d_buffer_t
|
|
2683
|
+
// that wraps around q and d to utilize image access path.
|
|
2684
|
+
q_img = nullptr;
|
|
2685
|
+
d_img = nullptr;
|
|
2686
|
+
size_q = 0;
|
|
2687
|
+
size_d = 0;
|
|
2688
|
+
}
|
|
2689
|
+
};
|
|
2690
|
+
|
|
2394
2691
|
//------------------------------------------------------------------------------
|
|
2395
2692
|
// Backend API
|
|
2396
2693
|
//------------------------------------------------------------------------------
|
|
@@ -2492,7 +2789,7 @@ static bool ggml_opencl_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
|
|
|
2492
2789
|
|
|
2493
2790
|
// if rms_norm is the B operand, then we don't handle broadcast
|
|
2494
2791
|
if (rms_norm == mul->src[1] &&
|
|
2495
|
-
!ggml_are_same_shape(mul->src[0], rms_norm
|
|
2792
|
+
!ggml_are_same_shape(mul->src[0], rms_norm)) {
|
|
2496
2793
|
return false;
|
|
2497
2794
|
}
|
|
2498
2795
|
|
|
@@ -2616,7 +2913,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2616
2913
|
switch (op->type) {
|
|
2617
2914
|
case GGML_TYPE_F16:
|
|
2618
2915
|
case GGML_TYPE_F32:
|
|
2619
|
-
return
|
|
2916
|
+
return (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
|
|
2620
2917
|
default:
|
|
2621
2918
|
return false;
|
|
2622
2919
|
}
|
|
@@ -2700,10 +2997,12 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2700
2997
|
case GGML_OP_REPEAT:
|
|
2701
2998
|
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; // Assuming F32 for now, can be expanded
|
|
2702
2999
|
case GGML_OP_PAD:
|
|
2703
|
-
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
|
|
2704
|
-
op->src[0]->ne[3] == 1 && op->ne[3] == 1;
|
|
2705
|
-
case GGML_OP_UPSCALE:
|
|
2706
3000
|
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32;
|
|
3001
|
+
case GGML_OP_UPSCALE: {
|
|
3002
|
+
ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF);
|
|
3003
|
+
return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 &&
|
|
3004
|
+
(mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR);
|
|
3005
|
+
}
|
|
2707
3006
|
case GGML_OP_CONV_2D:
|
|
2708
3007
|
return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) ||
|
|
2709
3008
|
(op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) ||
|
|
@@ -2722,10 +3021,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2722
3021
|
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || op->src[0]->type == GGML_TYPE_MXFP4 ||
|
|
2723
3022
|
op->src[0]->type == GGML_TYPE_Q6_K) {
|
|
2724
3023
|
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
3024
|
+
} else if (op->src[0]->type == GGML_TYPE_Q8_0) {
|
|
3025
|
+
return op->src[1]->type == GGML_TYPE_F32;
|
|
2725
3026
|
}
|
|
2726
3027
|
return false;
|
|
2727
3028
|
case GGML_OP_MUL_MAT_ID:
|
|
2728
3029
|
if (op->src[0]->type == GGML_TYPE_Q4_0 ||
|
|
3030
|
+
op->src[0]->type == GGML_TYPE_Q8_0 ||
|
|
2729
3031
|
op->src[0]->type == GGML_TYPE_MXFP4) {
|
|
2730
3032
|
if (op->src[1]->type == GGML_TYPE_F32) {
|
|
2731
3033
|
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
@@ -2776,10 +3078,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2776
3078
|
return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]);
|
|
2777
3079
|
case GGML_OP_FLASH_ATTN_EXT:
|
|
2778
3080
|
{
|
|
2779
|
-
if (op->src[4]) {
|
|
2780
|
-
return false;
|
|
2781
|
-
}
|
|
2782
|
-
|
|
2783
3081
|
const ggml_tensor * q = op->src[0];
|
|
2784
3082
|
const ggml_tensor * k = op->src[1];
|
|
2785
3083
|
const ggml_tensor * v = op->src[2];
|
|
@@ -2788,7 +3086,7 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
|
2788
3086
|
const int dv = v->ne[0];
|
|
2789
3087
|
|
|
2790
3088
|
const struct { int dk; int dv; } supported_dims[] = {
|
|
2791
|
-
{ 64, 64}, { 80, 80}, { 96, 96},
|
|
3089
|
+
{ 40, 40}, { 64, 64}, { 80, 80}, { 96, 96},
|
|
2792
3090
|
{112, 112}, {128, 128}, {192, 128},
|
|
2793
3091
|
{192, 192}, {256, 256},
|
|
2794
3092
|
};
|
|
@@ -2840,6 +3138,7 @@ static ggml_backend_i ggml_backend_opencl_i = {
|
|
|
2840
3138
|
/* .graph_compute = */ ggml_backend_opencl_graph_compute,
|
|
2841
3139
|
/* .event_record = */ NULL,
|
|
2842
3140
|
/* .event_wait = */ NULL,
|
|
3141
|
+
/* .graph_optimize = */ NULL,
|
|
2843
3142
|
};
|
|
2844
3143
|
|
|
2845
3144
|
ggml_backend_t ggml_backend_opencl_init(void) {
|
|
@@ -2895,6 +3194,18 @@ struct ggml_backend_opencl_buffer_context {
|
|
|
2895
3194
|
for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) {
|
|
2896
3195
|
delete e;
|
|
2897
3196
|
}
|
|
3197
|
+
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4) {
|
|
3198
|
+
delete e;
|
|
3199
|
+
}
|
|
3200
|
+
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
|
|
3201
|
+
delete e;
|
|
3202
|
+
}
|
|
3203
|
+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0) {
|
|
3204
|
+
delete e;
|
|
3205
|
+
}
|
|
3206
|
+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
|
|
3207
|
+
delete e;
|
|
3208
|
+
}
|
|
2898
3209
|
}
|
|
2899
3210
|
|
|
2900
3211
|
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() {
|
|
@@ -2927,6 +3238,36 @@ struct ggml_backend_opencl_buffer_context {
|
|
|
2927
3238
|
return extra;
|
|
2928
3239
|
}
|
|
2929
3240
|
|
|
3241
|
+
ggml_tensor_extra_cl_mxfp4 * ggml_opencl_alloc_temp_tensor_extra_mxfp4() {
|
|
3242
|
+
ggml_tensor_extra_cl_mxfp4 * extra;
|
|
3243
|
+
if (temp_tensor_extras_mxfp4.empty()) {
|
|
3244
|
+
extra = new ggml_tensor_extra_cl_mxfp4();
|
|
3245
|
+
} else {
|
|
3246
|
+
extra = temp_tensor_extras_mxfp4.back();
|
|
3247
|
+
temp_tensor_extras_mxfp4.pop_back();
|
|
3248
|
+
}
|
|
3249
|
+
|
|
3250
|
+
temp_tensor_extras_mxfp4_in_use.push_back(extra);
|
|
3251
|
+
|
|
3252
|
+
extra->reset();
|
|
3253
|
+
return extra;
|
|
3254
|
+
}
|
|
3255
|
+
|
|
3256
|
+
ggml_tensor_extra_cl_q8_0 * ggml_opencl_alloc_temp_tensor_extra_q8_0() {
|
|
3257
|
+
ggml_tensor_extra_cl_q8_0 * extra;
|
|
3258
|
+
if (temp_tensor_extras_q8_0.empty()) {
|
|
3259
|
+
extra = new ggml_tensor_extra_cl_q8_0();
|
|
3260
|
+
} else {
|
|
3261
|
+
extra = temp_tensor_extras_q8_0.back();
|
|
3262
|
+
temp_tensor_extras_q8_0.pop_back();
|
|
3263
|
+
}
|
|
3264
|
+
|
|
3265
|
+
temp_tensor_extras_q8_0_in_use.push_back(extra);
|
|
3266
|
+
|
|
3267
|
+
extra->reset();
|
|
3268
|
+
return extra;
|
|
3269
|
+
}
|
|
3270
|
+
|
|
2930
3271
|
void reset() {
|
|
2931
3272
|
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) {
|
|
2932
3273
|
temp_tensor_extras.push_back(e);
|
|
@@ -2937,6 +3278,16 @@ struct ggml_backend_opencl_buffer_context {
|
|
|
2937
3278
|
temp_tensor_extras_q4_0.push_back(e);
|
|
2938
3279
|
}
|
|
2939
3280
|
temp_tensor_extras_q4_0_in_use.clear();
|
|
3281
|
+
|
|
3282
|
+
for (ggml_tensor_extra_cl_mxfp4 * e : temp_tensor_extras_mxfp4_in_use) {
|
|
3283
|
+
temp_tensor_extras_mxfp4.push_back(e);
|
|
3284
|
+
}
|
|
3285
|
+
temp_tensor_extras_mxfp4_in_use.clear();
|
|
3286
|
+
|
|
3287
|
+
for (ggml_tensor_extra_cl_q8_0 * e : temp_tensor_extras_q8_0_in_use) {
|
|
3288
|
+
temp_tensor_extras_q8_0.push_back(e);
|
|
3289
|
+
}
|
|
3290
|
+
temp_tensor_extras_q8_0_in_use.clear();
|
|
2940
3291
|
}
|
|
2941
3292
|
|
|
2942
3293
|
// Pools for extras. Available extras are in `temp_tensor_extras`. Extras
|
|
@@ -2948,6 +3299,10 @@ struct ggml_backend_opencl_buffer_context {
|
|
|
2948
3299
|
std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use;
|
|
2949
3300
|
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0;
|
|
2950
3301
|
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use;
|
|
3302
|
+
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4;
|
|
3303
|
+
std::vector<ggml_tensor_extra_cl_mxfp4 *> temp_tensor_extras_mxfp4_in_use;
|
|
3304
|
+
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0;
|
|
3305
|
+
std::vector<ggml_tensor_extra_cl_q8_0 *> temp_tensor_extras_q8_0_in_use;
|
|
2951
3306
|
|
|
2952
3307
|
// The buffer_context is initially created by ggml_backend_buft_alloc_buffer
|
|
2953
3308
|
// before any tensor is initialized (at the beginning of alloc_tensor_range).
|
|
@@ -3032,6 +3387,12 @@ inline bool use_adreno_kernels(const ggml_backend_opencl_context *backend_ctx, c
|
|
|
3032
3387
|
tensor->ne[2] == 1 && tensor->ne[3] == 1;
|
|
3033
3388
|
}
|
|
3034
3389
|
|
|
3390
|
+
inline bool use_adreno_moe_kernels(const ggml_backend_opencl_context *backend_ctx, const ggml_tensor *tensor) {
|
|
3391
|
+
GGML_UNUSED(backend_ctx);
|
|
3392
|
+
int ne01 = tensor->ne[1];
|
|
3393
|
+
return ((strstr(tensor->name, "ffn") != NULL) || (strstr(tensor->name, "as") != NULL)) && (ne01 % 64 == 0);
|
|
3394
|
+
}
|
|
3395
|
+
|
|
3035
3396
|
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
|
3036
3397
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
|
3037
3398
|
|
|
@@ -3291,39 +3652,192 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
|
3291
3652
|
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
3292
3653
|
|
|
3293
3654
|
return;
|
|
3294
|
-
}
|
|
3295
|
-
#endif // GGML_OPENCL_SOA_Q
|
|
3296
|
-
|
|
3297
|
-
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
|
|
3298
|
-
GGML_ASSERT(extra);
|
|
3299
|
-
|
|
3300
|
-
CL_CHECK(clEnqueueWriteBuffer(
|
|
3301
|
-
queue, extra->data_device, CL_TRUE, extra->offset + offset,
|
|
3302
|
-
size, data, 0, NULL, NULL));
|
|
3303
|
-
|
|
3304
|
-
GGML_UNUSED(buffer);
|
|
3305
|
-
}
|
|
3306
3655
|
|
|
3307
|
-
|
|
3308
|
-
|
|
3309
|
-
|
|
3310
|
-
|
|
3311
|
-
|
|
3312
|
-
cl_context context = backend_ctx->context;
|
|
3313
|
-
cl_command_queue queue = backend_ctx->queue;
|
|
3656
|
+
}
|
|
3657
|
+
if (tensor->type == GGML_TYPE_MXFP4) {
|
|
3658
|
+
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
|
3659
|
+
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
|
3314
3660
|
|
|
3315
|
-
|
|
3316
|
-
|
|
3661
|
+
// Allocate the new extra and create aliases from the original.
|
|
3662
|
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
|
3663
|
+
ggml_tensor_extra_cl_mxfp4 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_mxfp4();
|
|
3317
3664
|
|
|
3318
|
-
|
|
3319
|
-
|
|
3320
|
-
|
|
3321
|
-
|
|
3322
|
-
|
|
3323
|
-
|
|
3324
|
-
|
|
3325
|
-
|
|
3326
|
-
|
|
3665
|
+
size_t size_e = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(char);
|
|
3666
|
+
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2;
|
|
3667
|
+
GGML_ASSERT(size_e + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
|
3668
|
+
|
|
3669
|
+
cl_int err;
|
|
3670
|
+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
3671
|
+
ggml_nbytes(tensor), NULL, &err);
|
|
3672
|
+
CL_CHECK(err);
|
|
3673
|
+
CL_CHECK(clEnqueueWriteBuffer(
|
|
3674
|
+
queue, data_device, CL_TRUE, 0,
|
|
3675
|
+
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
|
3676
|
+
|
|
3677
|
+
// The original tensor memory is divided into scales and quants, i.e.,
|
|
3678
|
+
// we first store scales, then quants.
|
|
3679
|
+
cl_buffer_region region;
|
|
3680
|
+
|
|
3681
|
+
// Create subbuffer for scales.
|
|
3682
|
+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
|
3683
|
+
region.size = size_e;
|
|
3684
|
+
extra->e = clCreateSubBuffer(
|
|
3685
|
+
extra_orig->data_device, CL_MEM_READ_WRITE,
|
|
3686
|
+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
|
3687
|
+
CL_CHECK(err);
|
|
3688
|
+
auto previous_origin = region.origin;
|
|
3689
|
+
|
|
3690
|
+
// Create subbuffer for quants.
|
|
3691
|
+
region.origin = align_to(previous_origin + size_e, backend_ctx->alignment);
|
|
3692
|
+
region.size = size_q;
|
|
3693
|
+
extra->q = clCreateSubBuffer(
|
|
3694
|
+
extra_orig->data_device, CL_MEM_READ_WRITE,
|
|
3695
|
+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
|
3696
|
+
CL_CHECK(err);
|
|
3697
|
+
|
|
3698
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
3699
|
+
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
|
3700
|
+
cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4_trans;
|
|
3701
|
+
|
|
3702
|
+
int ne00 = tensor->ne[0];
|
|
3703
|
+
int ne01 = tensor->ne[1];
|
|
3704
|
+
int ne02 = tensor->ne[2];
|
|
3705
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
|
3706
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
|
3707
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
|
|
3708
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &ne00));
|
|
3709
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01));
|
|
3710
|
+
|
|
3711
|
+
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
|
3712
|
+
size_t local_work_size[3] = {64, 2, 1};
|
|
3713
|
+
|
|
3714
|
+
cl_event evt;
|
|
3715
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3716
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3717
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3718
|
+
tensor->extra = extra;
|
|
3719
|
+
|
|
3720
|
+
return;
|
|
3721
|
+
}
|
|
3722
|
+
#endif
|
|
3723
|
+
cl_kernel kernel = backend_ctx->kernel_convert_block_mxfp4;
|
|
3724
|
+
|
|
3725
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
|
3726
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
|
3727
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->e));
|
|
3728
|
+
|
|
3729
|
+
size_t global_work_size[3] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
3730
|
+
size_t local_work_size[3] = {64, 1, 1};
|
|
3731
|
+
|
|
3732
|
+
cl_event evt;
|
|
3733
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3734
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3735
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3736
|
+
|
|
3737
|
+
// Create image for Q
|
|
3738
|
+
cl_image_format img_format_q = {CL_RG, CL_UNSIGNED_INT32};
|
|
3739
|
+
cl_image_desc img_desc_q = {
|
|
3740
|
+
CL_MEM_OBJECT_IMAGE1D_BUFFER,
|
|
3741
|
+
static_cast<size_t>(ggml_nelements(tensor)/32*2),
|
|
3742
|
+
0, 0, 0, 0, 0, 0, 0,
|
|
3743
|
+
{ extra->q }
|
|
3744
|
+
};
|
|
3745
|
+
extra->q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_format_q, &img_desc_q, NULL, &err);
|
|
3746
|
+
tensor->extra = extra;
|
|
3747
|
+
|
|
3748
|
+
return;
|
|
3749
|
+
}
|
|
3750
|
+
if (tensor->type == GGML_TYPE_Q8_0) {
|
|
3751
|
+
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra;
|
|
3752
|
+
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized");
|
|
3753
|
+
|
|
3754
|
+
// Allocate the new extra and create aliases from the original.
|
|
3755
|
+
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
|
|
3756
|
+
ggml_tensor_extra_cl_q8_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q8_0();
|
|
3757
|
+
|
|
3758
|
+
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t);
|
|
3759
|
+
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*(ggml_blck_size(tensor->type)*sizeof(char));
|
|
3760
|
+
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size");
|
|
3761
|
+
|
|
3762
|
+
cl_int err;
|
|
3763
|
+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
3764
|
+
ggml_nbytes(tensor), NULL, &err);
|
|
3765
|
+
CL_CHECK(err);
|
|
3766
|
+
CL_CHECK(clEnqueueWriteBuffer(
|
|
3767
|
+
queue, data_device, CL_TRUE, 0,
|
|
3768
|
+
ggml_nbytes(tensor), data, 0, NULL, NULL));
|
|
3769
|
+
|
|
3770
|
+
// The original tensor memory is divided into scales and quants, i.e.,
|
|
3771
|
+
// we first store scales, then quants.
|
|
3772
|
+
cl_buffer_region region;
|
|
3773
|
+
|
|
3774
|
+
// Create subbuffer for scales.
|
|
3775
|
+
region.origin = align_to(extra_orig->offset + tensor->view_offs + offset, backend_ctx->alignment);
|
|
3776
|
+
region.size = size_d;
|
|
3777
|
+
extra->d = clCreateSubBuffer(
|
|
3778
|
+
extra_orig->data_device, CL_MEM_READ_WRITE,
|
|
3779
|
+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
|
3780
|
+
CL_CHECK(err);
|
|
3781
|
+
auto previous_origin = region.origin;
|
|
3782
|
+
|
|
3783
|
+
// Create subbuffer for quants.
|
|
3784
|
+
region.origin = align_to(previous_origin + size_d, backend_ctx->alignment);
|
|
3785
|
+
region.size = size_q;
|
|
3786
|
+
extra->q = clCreateSubBuffer(
|
|
3787
|
+
extra_orig->data_device, CL_MEM_READ_WRITE,
|
|
3788
|
+
CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
|
3789
|
+
CL_CHECK(err);
|
|
3790
|
+
|
|
3791
|
+
cl_kernel kernel = backend_ctx->kernel_convert_block_q8_0;
|
|
3792
|
+
|
|
3793
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device));
|
|
3794
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q));
|
|
3795
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d));
|
|
3796
|
+
|
|
3797
|
+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
3798
|
+
size_t local_work_size[] = {64, 1, 1};
|
|
3799
|
+
|
|
3800
|
+
cl_event evt;
|
|
3801
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
3802
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3803
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3804
|
+
|
|
3805
|
+
tensor->extra = extra;
|
|
3806
|
+
|
|
3807
|
+
return;
|
|
3808
|
+
}
|
|
3809
|
+
#endif // GGML_OPENCL_SOA_Q
|
|
3810
|
+
|
|
3811
|
+
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
|
|
3812
|
+
GGML_ASSERT(extra);
|
|
3813
|
+
|
|
3814
|
+
CL_CHECK(clEnqueueWriteBuffer(
|
|
3815
|
+
queue, extra->data_device, CL_TRUE, extra->offset + offset,
|
|
3816
|
+
size, data, 0, NULL, NULL));
|
|
3817
|
+
|
|
3818
|
+
GGML_UNUSED(buffer);
|
|
3819
|
+
}
|
|
3820
|
+
|
|
3821
|
+
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
|
3822
|
+
GGML_ASSERT(tensor->extra);
|
|
3823
|
+
|
|
3824
|
+
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device);
|
|
3825
|
+
|
|
3826
|
+
cl_context context = backend_ctx->context;
|
|
3827
|
+
cl_command_queue queue = backend_ctx->queue;
|
|
3828
|
+
|
|
3829
|
+
// Make sure all previously submitted commands in other devices are finished.
|
|
3830
|
+
sync_with_other_backends(backend_ctx);
|
|
3831
|
+
|
|
3832
|
+
#ifdef GGML_OPENCL_SOA_Q
|
|
3833
|
+
// In end-to-end runs, get_tensor is usually used to get back the logits,
|
|
3834
|
+
// where we can simply do clEnqueueReadBuffer since they are f32.
|
|
3835
|
+
// However, in test-backend-ops, the GPU graph is copied to the CPU backend,
|
|
3836
|
+
// which requires reading back quantized weight tensors.
|
|
3837
|
+
// To properly support this, we need to restore block_q4_0 struct arrays
|
|
3838
|
+
// from the flattened buffers.
|
|
3839
|
+
if (tensor->type == GGML_TYPE_Q4_0) {
|
|
3840
|
+
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra;
|
|
3327
3841
|
|
|
3328
3842
|
cl_int err;
|
|
3329
3843
|
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
@@ -3338,6 +3852,84 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
|
3338
3852
|
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
3339
3853
|
size_t local_work_size[] = {1, 1, 1};
|
|
3340
3854
|
|
|
3855
|
+
cl_event evt;
|
|
3856
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
|
3857
|
+
global_work_size, local_work_size, 0, NULL, &evt));
|
|
3858
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3859
|
+
CL_CHECK(clEnqueueReadBuffer(
|
|
3860
|
+
queue, data_device, CL_TRUE, offset,
|
|
3861
|
+
size, data, 0, NULL, NULL));
|
|
3862
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3863
|
+
return;
|
|
3864
|
+
} else if (tensor->type == GGML_TYPE_MXFP4) {
|
|
3865
|
+
ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *)tensor->extra;
|
|
3866
|
+
|
|
3867
|
+
cl_int err;
|
|
3868
|
+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
3869
|
+
ggml_nbytes(tensor), NULL, &err);
|
|
3870
|
+
CL_CHECK(err);
|
|
3871
|
+
|
|
3872
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
3873
|
+
if (use_adreno_moe_kernels(backend_ctx, tensor)) {
|
|
3874
|
+
cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4_trans;
|
|
3875
|
+
|
|
3876
|
+
int ne00 = tensor->ne[0];
|
|
3877
|
+
int ne01 = tensor->ne[1];
|
|
3878
|
+
int ne02 = tensor->ne[2];
|
|
3879
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
|
3880
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
|
|
3881
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
|
3882
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_int), &ne00));
|
|
3883
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_int), &ne01));
|
|
3884
|
+
|
|
3885
|
+
size_t global_work_size[3] = {static_cast<size_t>(((ne01 + 63) / 64) * 64), static_cast<size_t>(ne00 / 32), static_cast<size_t>(ne02)};
|
|
3886
|
+
size_t local_work_size[3] = {64, 2, 1};
|
|
3887
|
+
|
|
3888
|
+
cl_event evt;
|
|
3889
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
|
3890
|
+
global_work_size, local_work_size, 0, NULL, &evt));
|
|
3891
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3892
|
+
CL_CHECK(clEnqueueReadBuffer(
|
|
3893
|
+
queue, data_device, CL_TRUE, offset,
|
|
3894
|
+
size, data, 0, NULL, NULL));
|
|
3895
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3896
|
+
return;
|
|
3897
|
+
}
|
|
3898
|
+
#endif
|
|
3899
|
+
cl_kernel kernel = backend_ctx->kernel_restore_block_mxfp4;
|
|
3900
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
|
3901
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->e));
|
|
3902
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
|
3903
|
+
|
|
3904
|
+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
3905
|
+
size_t local_work_size[] = {1, 1, 1};
|
|
3906
|
+
|
|
3907
|
+
cl_event evt;
|
|
3908
|
+
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
|
3909
|
+
global_work_size, local_work_size, 0, NULL, &evt));
|
|
3910
|
+
CL_CHECK(clWaitForEvents(1, &evt));
|
|
3911
|
+
CL_CHECK(clEnqueueReadBuffer(
|
|
3912
|
+
queue, data_device, CL_TRUE, offset,
|
|
3913
|
+
size, data, 0, NULL, NULL));
|
|
3914
|
+
CL_CHECK(clReleaseMemObject(data_device));
|
|
3915
|
+
return;
|
|
3916
|
+
}
|
|
3917
|
+
if (tensor->type == GGML_TYPE_Q8_0) {
|
|
3918
|
+
ggml_tensor_extra_cl_q8_0 * extra = (ggml_tensor_extra_cl_q8_0 *)tensor->extra;
|
|
3919
|
+
|
|
3920
|
+
cl_int err;
|
|
3921
|
+
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
3922
|
+
ggml_nbytes(tensor), NULL, &err);
|
|
3923
|
+
CL_CHECK(err);
|
|
3924
|
+
|
|
3925
|
+
cl_kernel kernel = backend_ctx->kernel_restore_block_q8_0;
|
|
3926
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q));
|
|
3927
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d));
|
|
3928
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device));
|
|
3929
|
+
|
|
3930
|
+
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1};
|
|
3931
|
+
size_t local_work_size[] = {1, 1, 1};
|
|
3932
|
+
|
|
3341
3933
|
cl_event evt;
|
|
3342
3934
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL,
|
|
3343
3935
|
global_work_size, local_work_size, 0, NULL, &evt));
|
|
@@ -3659,6 +4251,19 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
|
|
|
3659
4251
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
|
|
3660
4252
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL));
|
|
3661
4253
|
CL_CHECK(clFinish(queue));
|
|
4254
|
+
} else if (tensor->type == GGML_TYPE_MXFP4) {
|
|
4255
|
+
ggml_tensor_extra_cl_mxfp4 * extra = (ggml_tensor_extra_cl_mxfp4 *) tensor->extra;
|
|
4256
|
+
GGML_ASSERT(extra);
|
|
4257
|
+
|
|
4258
|
+
size_t size_q = ggml_nelements(tensor)/QK_MXFP4 * QK_MXFP4/2;
|
|
4259
|
+
size_t size_e = ggml_nelements(tensor)/QK_MXFP4 * sizeof(char);
|
|
4260
|
+
GGML_ASSERT(size_q + size_e == ggml_nbytes(tensor));
|
|
4261
|
+
buf_q = malloc(size_q);
|
|
4262
|
+
buf_d = malloc(size_e);
|
|
4263
|
+
|
|
4264
|
+
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL));
|
|
4265
|
+
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_e, buf_d, 0, NULL, NULL));
|
|
4266
|
+
CL_CHECK(clFinish(queue));
|
|
3662
4267
|
} else {
|
|
3663
4268
|
// Read out the tensor from GPU memory.
|
|
3664
4269
|
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra;
|
|
@@ -3782,15 +4387,19 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3782
4387
|
GGML_ASSERT(dst);
|
|
3783
4388
|
GGML_ASSERT(dst->extra);
|
|
3784
4389
|
|
|
3785
|
-
const int ne00 = src0
|
|
3786
|
-
const cl_ulong nb01 = src0
|
|
3787
|
-
const cl_ulong nb02 = src0
|
|
3788
|
-
const
|
|
3789
|
-
const
|
|
3790
|
-
const
|
|
3791
|
-
const
|
|
3792
|
-
const
|
|
3793
|
-
const cl_ulong
|
|
4390
|
+
const int ne00 = src0->ne[0];
|
|
4391
|
+
const cl_ulong nb01 = src0->nb[1];
|
|
4392
|
+
const cl_ulong nb02 = src0->nb[2];
|
|
4393
|
+
const cl_ulong nb03 = src0->nb[3];
|
|
4394
|
+
const int ne10 = src1->ne[0];
|
|
4395
|
+
const cl_ulong nb10 = src1->nb[0];
|
|
4396
|
+
const int ne11 = src1->ne[1];
|
|
4397
|
+
const int ne12 = src1->ne[2];
|
|
4398
|
+
const cl_ulong nb11 = src1->nb[1];
|
|
4399
|
+
const cl_ulong nb12 = src1->nb[2];
|
|
4400
|
+
const cl_ulong nb1 = dst->nb[1];
|
|
4401
|
+
const cl_ulong nb2 = dst->nb[2];
|
|
4402
|
+
const cl_ulong nb3 = dst->nb[3];
|
|
3794
4403
|
|
|
3795
4404
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
3796
4405
|
|
|
@@ -3827,14 +4436,17 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3827
4436
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
3828
4437
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
|
3829
4438
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
|
3830
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(
|
|
3831
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(
|
|
3832
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &
|
|
3833
|
-
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &
|
|
3834
|
-
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &
|
|
3835
|
-
|
|
3836
|
-
|
|
3837
|
-
|
|
4439
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
|
4440
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne10));
|
|
4441
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb10));
|
|
4442
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
|
4443
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
|
4444
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb1));
|
|
4445
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb2));
|
|
4446
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb3));
|
|
4447
|
+
|
|
4448
|
+
size_t global_work_size[] = {(size_t)ne10*64, (size_t)ne11, (size_t)ne12};
|
|
4449
|
+
size_t local_work_size[] = {64, 1, 1};
|
|
3838
4450
|
|
|
3839
4451
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
3840
4452
|
}
|
|
@@ -3846,6 +4458,7 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3846
4458
|
GGML_ASSERT(src1->extra);
|
|
3847
4459
|
GGML_ASSERT(dst);
|
|
3848
4460
|
GGML_ASSERT(dst->extra);
|
|
4461
|
+
GGML_ASSERT(src1->type == GGML_TYPE_I64 || src1->type == GGML_TYPE_I32);
|
|
3849
4462
|
|
|
3850
4463
|
// ne0 = ne00
|
|
3851
4464
|
// ne2 = ne02
|
|
@@ -3888,15 +4501,26 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3888
4501
|
|
|
3889
4502
|
switch (dst->type) {
|
|
3890
4503
|
case GGML_TYPE_F32:
|
|
3891
|
-
|
|
4504
|
+
if (src1->type == GGML_TYPE_I64) {
|
|
4505
|
+
kernel = backend_ctx->kernel_set_rows_f32_i64;
|
|
4506
|
+
} else {
|
|
4507
|
+
kernel = backend_ctx->kernel_set_rows_f32_i32;
|
|
4508
|
+
}
|
|
3892
4509
|
break;
|
|
3893
4510
|
case GGML_TYPE_F16:
|
|
3894
|
-
|
|
4511
|
+
if (src1->type == GGML_TYPE_I64) {
|
|
4512
|
+
kernel = backend_ctx->kernel_set_rows_f16_i64;
|
|
4513
|
+
} else {
|
|
4514
|
+
kernel = backend_ctx->kernel_set_rows_f16_i32;
|
|
4515
|
+
}
|
|
3895
4516
|
break;
|
|
3896
4517
|
default:
|
|
3897
4518
|
GGML_ABORT("not implemented");
|
|
3898
4519
|
}
|
|
3899
4520
|
|
|
4521
|
+
fastdiv_vals ne11_ = init_fastdiv_values(ne11);
|
|
4522
|
+
fastdiv_vals ne12_ = init_fastdiv_values(ne12);
|
|
4523
|
+
|
|
3900
4524
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
3901
4525
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
3902
4526
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
@@ -3907,8 +4531,8 @@ static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
|
3907
4531
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
|
3908
4532
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
|
3909
4533
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
|
3910
|
-
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(
|
|
3911
|
-
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(
|
|
4534
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(fastdiv_vals), &ne11_));
|
|
4535
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(fastdiv_vals), &ne12_));
|
|
3912
4536
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
|
|
3913
4537
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
|
|
3914
4538
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
|
|
@@ -5081,7 +5705,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
|
|
|
5081
5705
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb2));
|
|
5082
5706
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb3));
|
|
5083
5707
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &eps));
|
|
5084
|
-
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*
|
|
5708
|
+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs, NULL));
|
|
5085
5709
|
|
|
5086
5710
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
5087
5711
|
}
|
|
@@ -5425,7 +6049,6 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
5425
6049
|
GGML_ASSERT(dst->extra);
|
|
5426
6050
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
|
5427
6051
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
5428
|
-
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1);
|
|
5429
6052
|
|
|
5430
6053
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5431
6054
|
|
|
@@ -5443,28 +6066,67 @@ static void ggml_cl_pad(ggml_backend_t backend, const ggml_tensor * src0, ggml_t
|
|
|
5443
6066
|
const int s_ne0 = src0->ne[0];
|
|
5444
6067
|
const int s_ne1 = src0->ne[1];
|
|
5445
6068
|
const int s_ne2 = src0->ne[2];
|
|
6069
|
+
const int s_ne3 = src0->ne[3];
|
|
6070
|
+
|
|
6071
|
+
const int s_nb0 = src0->nb[0];
|
|
6072
|
+
const int s_nb1 = src0->nb[1];
|
|
6073
|
+
const int s_nb2 = src0->nb[2];
|
|
6074
|
+
const int s_nb3 = src0->nb[3];
|
|
5446
6075
|
|
|
5447
6076
|
const int d_ne0 = dst->ne[0];
|
|
5448
6077
|
const int d_ne1 = dst->ne[1];
|
|
5449
6078
|
const int d_ne2 = dst->ne[2];
|
|
6079
|
+
const int d_ne3 = dst->ne[3];
|
|
6080
|
+
|
|
6081
|
+
const int d_nb0 = dst->nb[0];
|
|
6082
|
+
const int d_nb1 = dst->nb[1];
|
|
6083
|
+
const int d_nb2 = dst->nb[2];
|
|
6084
|
+
const int d_nb3 = dst->nb[3];
|
|
6085
|
+
|
|
6086
|
+
const int lp0 = ((const int*)(dst->op_params))[0];
|
|
6087
|
+
const int rp0 = ((const int*)(dst->op_params))[1];
|
|
6088
|
+
const int lp1 = ((const int*)(dst->op_params))[2];
|
|
6089
|
+
const int rp1 = ((const int*)(dst->op_params))[3];
|
|
6090
|
+
const int lp2 = ((const int*)(dst->op_params))[4];
|
|
6091
|
+
const int rp2 = ((const int*)(dst->op_params))[5];
|
|
6092
|
+
const int lp3 = ((const int*)(dst->op_params))[6];
|
|
6093
|
+
const int rp3 = ((const int*)(dst->op_params))[7];
|
|
5450
6094
|
|
|
5451
6095
|
cl_kernel kernel = backend_ctx->kernel_pad;
|
|
5452
6096
|
|
|
5453
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5454
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5455
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5456
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5457
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5458
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5459
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5460
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5461
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
5462
|
-
CL_CHECK(clSetKernelArg(kernel,
|
|
6097
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra_src0->data_device));
|
|
6098
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &off_src0));
|
|
6099
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra_dst->data_device));
|
|
6100
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &off_dst));
|
|
6101
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &s_ne0));
|
|
6102
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &s_ne1));
|
|
6103
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &s_ne2));
|
|
6104
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &s_ne3));
|
|
6105
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &s_nb0));
|
|
6106
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &s_nb1));
|
|
6107
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &s_nb2));
|
|
6108
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &s_nb3));
|
|
6109
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &d_ne0));
|
|
6110
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &d_ne1));
|
|
6111
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &d_ne2));
|
|
6112
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &d_ne3));
|
|
6113
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &d_nb0));
|
|
6114
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &d_nb1));
|
|
6115
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &d_nb2));
|
|
6116
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &d_nb3));
|
|
6117
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &lp0));
|
|
6118
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &rp0));
|
|
6119
|
+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &lp1));
|
|
6120
|
+
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &rp1));
|
|
6121
|
+
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &lp2));
|
|
6122
|
+
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &rp2));
|
|
6123
|
+
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &lp3));
|
|
6124
|
+
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(int), &rp3));
|
|
5463
6125
|
|
|
5464
6126
|
size_t lws0 = 64;
|
|
5465
6127
|
size_t gws0 = (( (size_t)d_ne0 + lws0 - 1 ) / lws0) * lws0;
|
|
5466
6128
|
|
|
5467
|
-
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2 };
|
|
6129
|
+
size_t global_work_size[] = { gws0, (size_t)d_ne1, (size_t)d_ne2*d_ne3 };
|
|
5468
6130
|
size_t local_work_size[] = { lws0, 1, 1 };
|
|
5469
6131
|
|
|
5470
6132
|
size_t * local_work_size_ptr = local_work_size;
|
|
@@ -5554,8 +6216,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
|
|
|
5554
6216
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
|
|
5555
6217
|
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
|
|
5556
6218
|
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
|
|
5557
|
-
sf0 = (float)(ne0 - 1) / (ne00 - 1);
|
|
5558
|
-
sf1 = (float)(ne1 - 1) / (ne01 - 1);
|
|
6219
|
+
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
|
|
6220
|
+
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
|
|
5559
6221
|
pixel_offset = 0.0f;
|
|
5560
6222
|
}
|
|
5561
6223
|
|
|
@@ -5670,12 +6332,12 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
5670
6332
|
} else {
|
|
5671
6333
|
cl_kernel kernel = backend_ctx->kernel_concat_f32_non_contiguous;
|
|
5672
6334
|
|
|
5673
|
-
|
|
6335
|
+
cl_long ne00 = src0->ne[0], ne01 = src0->ne[1], ne02 = src0->ne[2], ne03 = src0->ne[3];
|
|
5674
6336
|
cl_ulong nb00 = src0->nb[0], nb01 = src0->nb[1], nb02 = src0->nb[2], nb03 = src0->nb[3];
|
|
5675
6337
|
|
|
5676
6338
|
cl_ulong nb10 = src1->nb[0], nb11 = src1->nb[1], nb12 = src1->nb[2], nb13 = src1->nb[3];
|
|
5677
6339
|
|
|
5678
|
-
|
|
6340
|
+
cl_long d_ne0 = dst->ne[0], d_ne1 = dst->ne[1], d_ne2 = dst->ne[2], d_ne3 = dst->ne[3];
|
|
5679
6341
|
cl_ulong d_nb0 = dst->nb[0], d_nb1 = dst->nb[1], d_nb2 = dst->nb[2], d_nb3 = dst->nb[3];
|
|
5680
6342
|
|
|
5681
6343
|
|
|
@@ -5686,10 +6348,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
5686
6348
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad_cl->data_device));
|
|
5687
6349
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &off_dst));
|
|
5688
6350
|
|
|
5689
|
-
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(
|
|
5690
|
-
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(
|
|
5691
|
-
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(
|
|
5692
|
-
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(
|
|
6351
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_long), &ne00));
|
|
6352
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_long), &ne01));
|
|
6353
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_long), &ne02));
|
|
6354
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_long), &ne03));
|
|
5693
6355
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00));
|
|
5694
6356
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01));
|
|
5695
6357
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
|
|
@@ -5700,10 +6362,10 @@ static void ggml_cl_concat(ggml_backend_t backend, const ggml_tensor * src0, con
|
|
|
5700
6362
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb12));
|
|
5701
6363
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb13));
|
|
5702
6364
|
|
|
5703
|
-
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(
|
|
5704
|
-
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(
|
|
5705
|
-
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(
|
|
5706
|
-
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(
|
|
6365
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_long), &d_ne0));
|
|
6366
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_long), &d_ne1));
|
|
6367
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_long), &d_ne2));
|
|
6368
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_long), &d_ne3));
|
|
5707
6369
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &d_nb0));
|
|
5708
6370
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &d_nb1));
|
|
5709
6371
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(cl_ulong), &d_nb2));
|
|
@@ -5765,6 +6427,7 @@ static void ggml_cl_timestep_embedding(ggml_backend_t backend, const ggml_tensor
|
|
|
5765
6427
|
static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, const ggml_tensor * k, ggml_tensor * dst) {
|
|
5766
6428
|
const ggml_tensor * v = dst->src[2];
|
|
5767
6429
|
const ggml_tensor * mask = dst->src[3];
|
|
6430
|
+
const ggml_tensor * sinks = dst->src[4];
|
|
5768
6431
|
GGML_ASSERT(q->extra);
|
|
5769
6432
|
GGML_ASSERT(k->extra);
|
|
5770
6433
|
GGML_ASSERT(v->extra);
|
|
@@ -5772,6 +6435,9 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
|
|
5772
6435
|
if (mask) {
|
|
5773
6436
|
GGML_ASSERT(mask->extra);
|
|
5774
6437
|
}
|
|
6438
|
+
if (sinks) {
|
|
6439
|
+
GGML_ASSERT(sinks->extra);
|
|
6440
|
+
}
|
|
5775
6441
|
|
|
5776
6442
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
5777
6443
|
|
|
@@ -5813,6 +6479,7 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
|
|
5813
6479
|
ggml_tensor_extra_cl * extra_v = (ggml_tensor_extra_cl *)v->extra;
|
|
5814
6480
|
ggml_tensor_extra_cl * extra_o = (ggml_tensor_extra_cl *)dst->extra;
|
|
5815
6481
|
ggml_tensor_extra_cl * extra_mask = mask ? (ggml_tensor_extra_cl *)mask->extra : NULL;
|
|
6482
|
+
ggml_tensor_extra_cl * extra_sinks = sinks ? (ggml_tensor_extra_cl *)sinks->extra : NULL;
|
|
5816
6483
|
|
|
5817
6484
|
cl_ulong offset_q = extra_q->offset + q->view_offs;
|
|
5818
6485
|
cl_ulong offset_k = extra_k->offset + k->view_offs;
|
|
@@ -5820,6 +6487,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
|
|
5820
6487
|
cl_ulong offset_o = extra_o->offset + dst->view_offs;
|
|
5821
6488
|
cl_mem mask_buffer = extra_mask ? extra_mask->data_device : NULL;
|
|
5822
6489
|
cl_ulong offset_mask = extra_mask ? extra_mask->offset + mask->view_offs : 0;
|
|
6490
|
+
cl_mem sinks_buffer = extra_sinks ? extra_sinks->data_device : NULL;
|
|
6491
|
+
cl_ulong offset_sinks = extra_sinks ? extra_sinks->offset + sinks->view_offs : 0;
|
|
5823
6492
|
|
|
5824
6493
|
const cl_ulong q_nb1 = q->nb[1], q_nb2 = q->nb[2], q_nb3 = q->nb[3];
|
|
5825
6494
|
const cl_ulong k_nb1 = k->nb[1], k_nb2 = k->nb[2], k_nb3 = k->nb[3];
|
|
@@ -5874,6 +6543,8 @@ static void ggml_cl_flash_attn(ggml_backend_t backend, const ggml_tensor * q, co
|
|
|
5874
6543
|
CL_CHECK(clSetKernelArg(kernel, 35, sizeof(cl_ulong), &mask_nb3));
|
|
5875
6544
|
CL_CHECK(clSetKernelArg(kernel, 36, sizeof(int), &mask_ne2));
|
|
5876
6545
|
CL_CHECK(clSetKernelArg(kernel, 37, sizeof(int), &mask_ne3));
|
|
6546
|
+
CL_CHECK(clSetKernelArg(kernel, 38, sizeof(cl_mem), &sinks_buffer));
|
|
6547
|
+
CL_CHECK(clSetKernelArg(kernel, 39, sizeof(cl_ulong), &offset_sinks));
|
|
5877
6548
|
|
|
5878
6549
|
if (n_q == 1) {
|
|
5879
6550
|
const size_t wg_size = 64;
|
|
@@ -6017,6 +6688,146 @@ static void ggml_cl_conv_2d(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6017
6688
|
backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size, local_work_size, dst);
|
|
6018
6689
|
}
|
|
6019
6690
|
|
|
6691
|
+
static void ggml_cl_mul_mat_kq_kqv_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
6692
|
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
|
6693
|
+
|
|
6694
|
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
|
6695
|
+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
|
6696
|
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
|
6697
|
+
|
|
6698
|
+
const int ne00 = src0->ne[0];
|
|
6699
|
+
const int ne01 = src0->ne[1];
|
|
6700
|
+
const int ne02 = src0->ne[2];
|
|
6701
|
+
|
|
6702
|
+
const cl_ulong nb01 = src0->nb[1];
|
|
6703
|
+
const cl_ulong nb02 = src0->nb[2];
|
|
6704
|
+
|
|
6705
|
+
const int ne10 = src1->ne[0];
|
|
6706
|
+
const int ne11 = src1->ne[1];
|
|
6707
|
+
const int ne12 = src1->ne[2];
|
|
6708
|
+
|
|
6709
|
+
const cl_ulong nb10 = src1->nb[0];
|
|
6710
|
+
|
|
6711
|
+
const int ne0 = dst->ne[0];
|
|
6712
|
+
const int ne1 = dst->ne[1];
|
|
6713
|
+
|
|
6714
|
+
GGML_ASSERT(ne00 == ne10);
|
|
6715
|
+
|
|
6716
|
+
cl_kernel kernel;
|
|
6717
|
+
cl_context context = backend_ctx->context;
|
|
6718
|
+
|
|
6719
|
+
cl_int status;
|
|
6720
|
+
cl_image_format img_fmt_1d;
|
|
6721
|
+
cl_image_desc img_desc_1d;
|
|
6722
|
+
cl_buffer_region region;
|
|
6723
|
+
cl_mem A_image1d;
|
|
6724
|
+
cl_mem A_sub_buffer;
|
|
6725
|
+
cl_mem B_sub_buffer;
|
|
6726
|
+
cl_mem D_image1d;
|
|
6727
|
+
cl_mem D_sub_buffer;
|
|
6728
|
+
|
|
6729
|
+
int M = ne01;
|
|
6730
|
+
int N = ne1;
|
|
6731
|
+
int K = ne00;
|
|
6732
|
+
|
|
6733
|
+
if (nb01 > nb02) {
|
|
6734
|
+
// KQ
|
|
6735
|
+
kernel = backend_ctx->kernel_mul_mm_f16_f32_kq;
|
|
6736
|
+
} else {
|
|
6737
|
+
// KQV
|
|
6738
|
+
kernel = backend_ctx->kernel_mul_mm_f16_f32_kqv;
|
|
6739
|
+
}
|
|
6740
|
+
// create sub-buffer for A
|
|
6741
|
+
// <--------------------------------------------> //
|
|
6742
|
+
extra0 = src0->view_src ? (ggml_tensor_extra_cl *)src0->view_src->extra : (ggml_tensor_extra_cl *)src0->extra;
|
|
6743
|
+
|
|
6744
|
+
region.origin = (extra0->offset);
|
|
6745
|
+
if (nb01 > nb02) {
|
|
6746
|
+
// KQ
|
|
6747
|
+
region.size = nb01 * ne01;
|
|
6748
|
+
} else {
|
|
6749
|
+
// KQV
|
|
6750
|
+
region.size = nb02 * ne02;
|
|
6751
|
+
}
|
|
6752
|
+
|
|
6753
|
+
A_sub_buffer = clCreateSubBuffer((extra0->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
|
6754
|
+
CL_CHECK(status);
|
|
6755
|
+
|
|
6756
|
+
// <--------------------------------------------> //
|
|
6757
|
+
|
|
6758
|
+
// create sub-buffer for B
|
|
6759
|
+
// <--------------------------------------------> //
|
|
6760
|
+
region.origin = (extra1->offset);
|
|
6761
|
+
region.size = nb10 * ne10 * ne11 * ne12;
|
|
6762
|
+
B_sub_buffer = clCreateSubBuffer((extra1->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
|
6763
|
+
CL_CHECK(status);
|
|
6764
|
+
// <--------------------------------------------> //
|
|
6765
|
+
|
|
6766
|
+
img_fmt_1d = {CL_RGBA, CL_FLOAT};
|
|
6767
|
+
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
6768
|
+
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
6769
|
+
if (nb01 > nb02) {
|
|
6770
|
+
img_desc_1d.image_width = (nb01 * ne01 / 4)/4;
|
|
6771
|
+
}
|
|
6772
|
+
else {
|
|
6773
|
+
img_desc_1d.image_width = (nb02 * ne02 / 4)/4;
|
|
6774
|
+
}
|
|
6775
|
+
img_desc_1d.buffer = A_sub_buffer;
|
|
6776
|
+
A_image1d = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
6777
|
+
CL_CHECK(status);
|
|
6778
|
+
|
|
6779
|
+
// create sub-buffer for output C
|
|
6780
|
+
// <--------------------------------------------> //
|
|
6781
|
+
region.origin = (extrad->offset);
|
|
6782
|
+
region.size = ne0 * ne1 * dst->ne[2] * dst->nb[0]; // size of C in bytes
|
|
6783
|
+
D_sub_buffer = clCreateSubBuffer((extrad->data_device), 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
|
6784
|
+
CL_CHECK(status);
|
|
6785
|
+
// <--------------------------------------------> //
|
|
6786
|
+
|
|
6787
|
+
// create image for C output
|
|
6788
|
+
// <--------------------------------------------> //
|
|
6789
|
+
img_fmt_1d = {CL_R, CL_FLOAT};
|
|
6790
|
+
memset(&img_desc_1d, 0, sizeof(img_desc_1d));
|
|
6791
|
+
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
|
|
6792
|
+
img_desc_1d.image_width = ne0 * ne1 * dst->ne[2] * dst->nb[0] / 4;
|
|
6793
|
+
img_desc_1d.buffer = D_sub_buffer;
|
|
6794
|
+
D_image1d = clCreateImage(context, CL_MEM_WRITE_ONLY, &img_fmt_1d, &img_desc_1d, NULL, &status);
|
|
6795
|
+
CL_CHECK(status);
|
|
6796
|
+
// <--------------------------------------------> //
|
|
6797
|
+
|
|
6798
|
+
int offset_src0 = 0;
|
|
6799
|
+
int offset_src1 = 0;
|
|
6800
|
+
|
|
6801
|
+
// set kernel args
|
|
6802
|
+
// <--------------------------------------------> //
|
|
6803
|
+
cl_uint k_arg = 0;
|
|
6804
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d));
|
|
6805
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src0));
|
|
6806
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_sub_buffer));
|
|
6807
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &offset_src1));
|
|
6808
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &D_image1d));
|
|
6809
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &extrad->offset));
|
|
6810
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &M));
|
|
6811
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &K));
|
|
6812
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &N));
|
|
6813
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02));
|
|
6814
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12));
|
|
6815
|
+
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &nb01));
|
|
6816
|
+
|
|
6817
|
+
size_t global_work_size[3] = {64, static_cast<size_t>(((M+63)/64)), static_cast<size_t>(((N+31)/32)*ne12)};
|
|
6818
|
+
size_t local_work_size[3] = {64, 1, 2};
|
|
6819
|
+
|
|
6820
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6821
|
+
|
|
6822
|
+
// deallocate sub buffers and images
|
|
6823
|
+
// <--------------------------------------------> //
|
|
6824
|
+
CL_CHECK(clReleaseMemObject(A_image1d));
|
|
6825
|
+
CL_CHECK(clReleaseMemObject(D_image1d));
|
|
6826
|
+
CL_CHECK(clReleaseMemObject(A_sub_buffer));
|
|
6827
|
+
CL_CHECK(clReleaseMemObject(B_sub_buffer));
|
|
6828
|
+
CL_CHECK(clReleaseMemObject(D_sub_buffer));
|
|
6829
|
+
}
|
|
6830
|
+
|
|
6020
6831
|
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
6021
6832
|
GGML_ASSERT(src0);
|
|
6022
6833
|
GGML_ASSERT(src0->extra);
|
|
@@ -6040,6 +6851,8 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6040
6851
|
|
|
6041
6852
|
#ifdef GGML_OPENCL_SOA_Q
|
|
6042
6853
|
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
|
6854
|
+
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
|
6855
|
+
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
|
6043
6856
|
#endif
|
|
6044
6857
|
|
|
6045
6858
|
const int ne00 = src0 ? src0->ne[0] : 0;
|
|
@@ -6081,6 +6894,27 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6081
6894
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
6082
6895
|
cl_context context = backend_ctx->context;
|
|
6083
6896
|
|
|
6897
|
+
if(src0t == GGML_TYPE_F16 && src1t == GGML_TYPE_F32){
|
|
6898
|
+
if (ne01 >= 64 && ne1 >= 32 && ne00 >= 16 && (ne12 % ne02) == 0) {
|
|
6899
|
+
// For KQ
|
|
6900
|
+
if (ggml_is_permuted(src0) && ggml_is_permuted(src1) &&
|
|
6901
|
+
nb00 <= nb02 &&
|
|
6902
|
+
nb02 <= nb01 &&
|
|
6903
|
+
nb01 <= nb03 &&
|
|
6904
|
+
nb10 <= nb12 &&
|
|
6905
|
+
nb12 <= nb11 &&
|
|
6906
|
+
nb11 <= nb13) {
|
|
6907
|
+
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
|
6908
|
+
return;
|
|
6909
|
+
}
|
|
6910
|
+
// For KQV
|
|
6911
|
+
if (!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
|
|
6912
|
+
ggml_cl_mul_mat_kq_kqv_adreno(backend, src0, src1, dst);
|
|
6913
|
+
return;
|
|
6914
|
+
}
|
|
6915
|
+
}
|
|
6916
|
+
}
|
|
6917
|
+
|
|
6084
6918
|
if (ne01 && ne1 && use_adreno_kernels(backend_ctx, src0)) {
|
|
6085
6919
|
|
|
6086
6920
|
// init CL objects
|
|
@@ -6454,6 +7288,44 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6454
7288
|
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
6455
7289
|
return;
|
|
6456
7290
|
}
|
|
7291
|
+
case GGML_TYPE_Q8_0: {
|
|
7292
|
+
if (ne11 < 32) {
|
|
7293
|
+
break;
|
|
7294
|
+
}
|
|
7295
|
+
kernel = backend_ctx->kernel_mul_mm_q8_0_f32_l4_lm;
|
|
7296
|
+
nth0 = 128; // calculated as (BM*BN)/(TM*TN)
|
|
7297
|
+
|
|
7298
|
+
int batch_stride_a = ne00*ne01;
|
|
7299
|
+
int batch_stride_b = ne10*ne11;
|
|
7300
|
+
int batch_stride_d = ne0*ne1;
|
|
7301
|
+
|
|
7302
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
|
7303
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
|
7304
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7305
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7306
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
7307
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
7308
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
7309
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
|
7310
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02));
|
|
7311
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne11));
|
|
7312
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
|
7313
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne10)); // stride_a
|
|
7314
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); // stride_b
|
|
7315
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne01)); // stride_d
|
|
7316
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &batch_stride_a));
|
|
7317
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &batch_stride_b));
|
|
7318
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &batch_stride_d));
|
|
7319
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
|
7320
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
|
7321
|
+
|
|
7322
|
+
// 64 is block tile size BM and BN - change here when BM and BN in the kernel are changed.
|
|
7323
|
+
size_t global_work_size[] = {(size_t)(CEIL_DIV(ne01, 64)*nth0), (size_t)(CEIL_DIV(ne11, 64)), (size_t)ne12*ne13};
|
|
7324
|
+
size_t local_work_size[] = {(size_t)nth0, 1, 1};
|
|
7325
|
+
|
|
7326
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
|
7327
|
+
return;
|
|
7328
|
+
}
|
|
6457
7329
|
default:
|
|
6458
7330
|
break;
|
|
6459
7331
|
}
|
|
@@ -6709,7 +7581,84 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6709
7581
|
#endif // GGML_OPENCL_SOA_Q
|
|
6710
7582
|
break;
|
|
6711
7583
|
case GGML_TYPE_Q4_1:
|
|
6712
|
-
case GGML_TYPE_Q8_0:
|
|
7584
|
+
case GGML_TYPE_Q8_0: {
|
|
7585
|
+
#ifdef GGML_OPENCL_SOA_Q
|
|
7586
|
+
kernel = backend_ctx->kernel_mul_mv_q8_0_f32_flat;
|
|
7587
|
+
|
|
7588
|
+
// nth0 - subgroup size
|
|
7589
|
+
// nth1 - number of subgroups per workgroup
|
|
7590
|
+
// ndst - number of output values per workgroup = output per subgroup * number of subgroups
|
|
7591
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
7592
|
+
nth0 = 16;
|
|
7593
|
+
nth1 = 2;
|
|
7594
|
+
ndst = nth1*4;
|
|
7595
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
7596
|
+
nth0 = 64;
|
|
7597
|
+
nth1 = 2;
|
|
7598
|
+
ndst = nth1*4;
|
|
7599
|
+
} else {
|
|
7600
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
7601
|
+
}
|
|
7602
|
+
|
|
7603
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
|
7604
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
|
7605
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7606
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7607
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
7608
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
7609
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
7610
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
|
7611
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
|
7612
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
|
7613
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
|
7614
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
|
7615
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
|
7616
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
|
7617
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
|
7618
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
|
|
7619
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
|
|
7620
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
|
7621
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
|
7622
|
+
#else
|
|
7623
|
+
kernel = backend_ctx->kernel_mul_mv_q8_0_f32;
|
|
7624
|
+
|
|
7625
|
+
// nth0 - subgroup size
|
|
7626
|
+
// nth1 - number of subgroups per workgroup
|
|
7627
|
+
// ndst - number of output values per workgroup = output per subgroup * number of subgroups
|
|
7628
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
7629
|
+
nth0 = 16;
|
|
7630
|
+
nth1 = 2;
|
|
7631
|
+
ndst = nth1*4;
|
|
7632
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
7633
|
+
nth0 = 64;
|
|
7634
|
+
nth1 = 2;
|
|
7635
|
+
ndst = nth1*4;
|
|
7636
|
+
} else {
|
|
7637
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
7638
|
+
}
|
|
7639
|
+
|
|
7640
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
7641
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
7642
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7643
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7644
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
7645
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
7646
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
7647
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01));
|
|
7648
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01));
|
|
7649
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02));
|
|
7650
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03));
|
|
7651
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
|
7652
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
|
|
7653
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
|
|
7654
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
|
|
7655
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne0));
|
|
7656
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne1));
|
|
7657
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r2));
|
|
7658
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &r3));
|
|
7659
|
+
#endif // GGML_OPENCL_SOA_Q
|
|
7660
|
+
break;
|
|
7661
|
+
}
|
|
6713
7662
|
case GGML_TYPE_Q2_K:
|
|
6714
7663
|
case GGML_TYPE_Q3_K:
|
|
6715
7664
|
case GGML_TYPE_Q4_K:
|
|
@@ -6744,6 +7693,45 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6744
7693
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3));
|
|
6745
7694
|
break;
|
|
6746
7695
|
case GGML_TYPE_MXFP4: {
|
|
7696
|
+
#ifdef GGML_OPENCL_SOA_Q
|
|
7697
|
+
kernel = backend_ctx->kernel_mul_mv_mxfp4_f32_flat;
|
|
7698
|
+
|
|
7699
|
+
cl_mem q;
|
|
7700
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
7701
|
+
nth0 = 16;
|
|
7702
|
+
nth1 = 2;
|
|
7703
|
+
ndst = nth1*2;
|
|
7704
|
+
|
|
7705
|
+
q = extra0_mxfp4->q;
|
|
7706
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
7707
|
+
nth0 = 64;
|
|
7708
|
+
nth1 = 2;
|
|
7709
|
+
ndst = nth1*2;
|
|
7710
|
+
|
|
7711
|
+
q = extra0_mxfp4->q_img;
|
|
7712
|
+
} else {
|
|
7713
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
7714
|
+
}
|
|
7715
|
+
|
|
7716
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
|
|
7717
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
|
|
7718
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7719
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7720
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
|
7721
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
|
7722
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
|
|
7723
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
|
7724
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
|
7725
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
|
7726
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
|
|
7727
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11));
|
|
7728
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb12));
|
|
7729
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb13));
|
|
7730
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne0));
|
|
7731
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne1));
|
|
7732
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
|
|
7733
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
|
|
7734
|
+
#else
|
|
6747
7735
|
kernel = backend_ctx->kernel_mul_mv_mxfp4_f32;
|
|
6748
7736
|
|
|
6749
7737
|
if (backend_ctx->gpu_family == INTEL) {
|
|
@@ -6777,6 +7765,7 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
|
|
|
6777
7765
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &r2));
|
|
6778
7766
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &r3));
|
|
6779
7767
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float)*nth0,nullptr));
|
|
7768
|
+
#endif
|
|
6780
7769
|
break;
|
|
6781
7770
|
}
|
|
6782
7771
|
default:
|
|
@@ -6842,8 +7831,12 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
6842
7831
|
cl_ulong offset2 = extra2->offset + src2->view_offs;
|
|
6843
7832
|
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
|
6844
7833
|
|
|
7834
|
+
GGML_UNUSED(offset0);
|
|
7835
|
+
|
|
6845
7836
|
#ifdef GGML_OPENCL_SOA_Q
|
|
6846
7837
|
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
|
|
7838
|
+
ggml_tensor_extra_cl_mxfp4 * extra0_mxfp4 = (ggml_tensor_extra_cl_mxfp4 *)src0->extra;
|
|
7839
|
+
ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra;
|
|
6847
7840
|
#endif
|
|
6848
7841
|
|
|
6849
7842
|
const int ne00 = src0->ne[0];
|
|
@@ -6869,6 +7862,9 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
6869
7862
|
const int ne21 = src2->ne[1];
|
|
6870
7863
|
|
|
6871
7864
|
const cl_ulong nb21 = src2->nb[1];
|
|
7865
|
+
const cl_ulong nb20 = src2->nb[0];
|
|
7866
|
+
|
|
7867
|
+
UNUSED(nb20);
|
|
6872
7868
|
|
|
6873
7869
|
const int ne0 = dst->ne[0];
|
|
6874
7870
|
const int ne1 = dst->ne[1];
|
|
@@ -6931,7 +7927,227 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
6931
7927
|
|
|
6932
7928
|
break;
|
|
6933
7929
|
}
|
|
7930
|
+
case GGML_TYPE_Q8_0: {
|
|
7931
|
+
#ifdef GGML_OPENCL_SOA_Q
|
|
7932
|
+
kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32_flat;
|
|
7933
|
+
|
|
7934
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
7935
|
+
sgs = 16;
|
|
7936
|
+
nsg = 2;
|
|
7937
|
+
ndst = 4;
|
|
7938
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
7939
|
+
sgs = 64;
|
|
7940
|
+
nsg = 2;
|
|
7941
|
+
ndst = 4;
|
|
7942
|
+
} else {
|
|
7943
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
7944
|
+
}
|
|
7945
|
+
|
|
7946
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q8_0->q));
|
|
7947
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q8_0->d));
|
|
7948
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7949
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7950
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
|
7951
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
|
7952
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
|
7953
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
|
7954
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
|
7955
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
|
7956
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
|
7957
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
|
7958
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
|
|
7959
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
|
7960
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
|
7961
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
|
7962
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
|
|
7963
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
|
|
7964
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
|
|
7965
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
|
|
7966
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
|
|
7967
|
+
#else
|
|
7968
|
+
kernel = backend_ctx->kernel_mul_mv_id_q8_0_f32;
|
|
7969
|
+
|
|
7970
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
7971
|
+
sgs = 16;
|
|
7972
|
+
nsg = 2;
|
|
7973
|
+
ndst = 4;
|
|
7974
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
7975
|
+
sgs = 64;
|
|
7976
|
+
nsg = 2;
|
|
7977
|
+
ndst = 4;
|
|
7978
|
+
} else {
|
|
7979
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
7980
|
+
}
|
|
7981
|
+
|
|
7982
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
|
7983
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
|
7984
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
7985
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
7986
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
|
7987
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
|
7988
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
|
7989
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
|
7990
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
|
7991
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01));
|
|
7992
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01));
|
|
7993
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02));
|
|
7994
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
|
|
7995
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
|
7996
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
|
7997
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
|
7998
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne20));
|
|
7999
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne21));
|
|
8000
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb21));
|
|
8001
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne0));
|
|
8002
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne1));
|
|
8003
|
+
#endif // GGML_OPENCL_SOA_Q
|
|
8004
|
+
break;
|
|
8005
|
+
}
|
|
6934
8006
|
case GGML_TYPE_MXFP4: {
|
|
8007
|
+
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS
|
|
8008
|
+
if (use_adreno_moe_kernels(backend_ctx, src0)) {
|
|
8009
|
+
cl_int status;
|
|
8010
|
+
|
|
8011
|
+
size_t local_size[3] = {64, 2, 1};
|
|
8012
|
+
size_t global_size[3] = {64, 2, 1};
|
|
8013
|
+
|
|
8014
|
+
cl_mem src1_sub_buffer, buf_src1_image, buf_src2;
|
|
8015
|
+
|
|
8016
|
+
int tile_size = 320;
|
|
8017
|
+
if (ne12 == 1) { // for gemv
|
|
8018
|
+
kernel = backend_ctx->kernel_gemv_moe_mxfp4_f32;
|
|
8019
|
+
|
|
8020
|
+
// create a sub_buffer for src2
|
|
8021
|
+
cl_buffer_region region;
|
|
8022
|
+
region.origin = offset2;
|
|
8023
|
+
region.size = ne20 * ne21 * sizeof(int);
|
|
8024
|
+
buf_src2 = clCreateSubBuffer(extra2->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
|
8025
|
+
CL_CHECK(status);
|
|
8026
|
+
|
|
8027
|
+
// set thread grid
|
|
8028
|
+
global_size[0] = static_cast<size_t>(ne01);
|
|
8029
|
+
global_size[1] = 4;
|
|
8030
|
+
global_size[2] = static_cast<size_t>(ne20);
|
|
8031
|
+
local_size[1] = 4;
|
|
8032
|
+
} else { // for gemm
|
|
8033
|
+
kernel = backend_ctx->kernel_gemm_moe_mxfp4_f32;
|
|
8034
|
+
|
|
8035
|
+
// preprocess router table
|
|
8036
|
+
int num_tiles_per_expert = (ne01 + tile_size - 1) / tile_size;
|
|
8037
|
+
void * host_src2_reorder = malloc(ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short));
|
|
8038
|
+
void * host_src2 = malloc(ne21 * nb21);
|
|
8039
|
+
CL_CHECK(clEnqueueReadBuffer(backend_ctx->queue, extra2->data_device, CL_TRUE, offset2, ne21 * nb21, host_src2, 0, NULL, NULL));
|
|
8040
|
+
int total_experts = nb21 / nb20;
|
|
8041
|
+
int out_idx = 0;
|
|
8042
|
+
for (int i_expert = 0; i_expert < ne02; i_expert++) {
|
|
8043
|
+
for (int i_tile = 0; i_tile < num_tiles_per_expert; i_tile++) {
|
|
8044
|
+
for (int j = 0; j < ne21; j++) {
|
|
8045
|
+
for (int i = 0; i < ne20; i++) {
|
|
8046
|
+
int expert = ((int *)host_src2)[j * total_experts + i];
|
|
8047
|
+
if (i_expert == expert) {
|
|
8048
|
+
((short *)host_src2_reorder)[out_idx] = static_cast<short>(expert);
|
|
8049
|
+
((short *)host_src2_reorder)[out_idx + 1] = static_cast<short>(j * ne11 + (i % ne11));
|
|
8050
|
+
((short *)host_src2_reorder)[out_idx + 2] = static_cast<short>(j * ne20 + i);
|
|
8051
|
+
((short *)host_src2_reorder)[out_idx + 3] = static_cast<short>(i_tile);
|
|
8052
|
+
out_idx += 4;
|
|
8053
|
+
}
|
|
8054
|
+
}
|
|
8055
|
+
}
|
|
8056
|
+
}
|
|
8057
|
+
}
|
|
8058
|
+
buf_src2 = clCreateBuffer(backend_ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ne20 * ne21 * 4 * num_tiles_per_expert * sizeof(short), host_src2_reorder, &status);
|
|
8059
|
+
CL_CHECK(status);
|
|
8060
|
+
|
|
8061
|
+
// set thread grid
|
|
8062
|
+
global_size[0] = static_cast<size_t>(tile_size);
|
|
8063
|
+
global_size[2] = static_cast<size_t>(ne20 * ne21 * num_tiles_per_expert);
|
|
8064
|
+
}
|
|
8065
|
+
|
|
8066
|
+
// create a sub_buffer for src1
|
|
8067
|
+
cl_buffer_region region;
|
|
8068
|
+
region.origin = offset1;
|
|
8069
|
+
region.size = ne10 * ne11 * ne12 * sizeof(float);
|
|
8070
|
+
src1_sub_buffer = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
|
8071
|
+
CL_CHECK(status);
|
|
8072
|
+
|
|
8073
|
+
// create image for src1
|
|
8074
|
+
cl_image_format image_format_buf_src1 = {CL_RGBA, CL_FLOAT};
|
|
8075
|
+
cl_image_desc image_desc_buf_src1 = {CL_MEM_OBJECT_IMAGE1D_BUFFER, static_cast<size_t>(ne10 * ne11 * ne12 / 4), 0,0,0,0,0,0,0, {src1_sub_buffer}};
|
|
8076
|
+
buf_src1_image = clCreateImage(backend_ctx->context, CL_MEM_READ_ONLY, &image_format_buf_src1, &image_desc_buf_src1, NULL, &status);
|
|
8077
|
+
CL_CHECK(status);
|
|
8078
|
+
|
|
8079
|
+
// Set kernel args
|
|
8080
|
+
int arg_idx = 0;
|
|
8081
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->q));
|
|
8082
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extra0_mxfp4->e));
|
|
8083
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src1_image));
|
|
8084
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &buf_src2));
|
|
8085
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_mem), &extrad->data_device));
|
|
8086
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(cl_ulong), &offsetd));
|
|
8087
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne00));
|
|
8088
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne01));
|
|
8089
|
+
if (ne12 == 1) {
|
|
8090
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &ne11));
|
|
8091
|
+
} else {
|
|
8092
|
+
CL_CHECK(clSetKernelArg(kernel, arg_idx++, sizeof(int), &tile_size));
|
|
8093
|
+
}
|
|
8094
|
+
|
|
8095
|
+
// launch kernel
|
|
8096
|
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_size, local_size, dst);
|
|
8097
|
+
|
|
8098
|
+
// deallocate sub buffers and images
|
|
8099
|
+
CL_CHECK(clReleaseMemObject(src1_sub_buffer));
|
|
8100
|
+
CL_CHECK(clReleaseMemObject(buf_src1_image));
|
|
8101
|
+
CL_CHECK(clReleaseMemObject(buf_src2));
|
|
8102
|
+
return;
|
|
8103
|
+
} // else fallback to generic kernel
|
|
8104
|
+
#endif // GGML_OPENCL_USE_ADRENO_KERNELS
|
|
8105
|
+
|
|
8106
|
+
#ifdef GGML_OPENCL_SOA_Q
|
|
8107
|
+
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32_flat;
|
|
8108
|
+
|
|
8109
|
+
cl_mem q;
|
|
8110
|
+
if (backend_ctx->gpu_family == INTEL) {
|
|
8111
|
+
sgs = 16;
|
|
8112
|
+
nsg = 2;
|
|
8113
|
+
ndst = 2;
|
|
8114
|
+
|
|
8115
|
+
q = extra0_mxfp4->q;
|
|
8116
|
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
|
8117
|
+
sgs = 64;
|
|
8118
|
+
nsg = 1;
|
|
8119
|
+
ndst = 4;
|
|
8120
|
+
|
|
8121
|
+
q = extra0_mxfp4->q_img;
|
|
8122
|
+
} else {
|
|
8123
|
+
GGML_ASSERT(false && "TODO: Unknown GPU");
|
|
8124
|
+
}
|
|
8125
|
+
|
|
8126
|
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q));
|
|
8127
|
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_mxfp4->e));
|
|
8128
|
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
|
8129
|
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
|
8130
|
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra2->data_device));
|
|
8131
|
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
|
|
8132
|
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
|
|
8133
|
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
|
|
8134
|
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
|
|
8135
|
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
|
|
8136
|
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
|
|
8137
|
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
|
|
8138
|
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne11));
|
|
8139
|
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne12));
|
|
8140
|
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
|
|
8141
|
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
|
|
8142
|
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
|
|
8143
|
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne20));
|
|
8144
|
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne21));
|
|
8145
|
+
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb21));
|
|
8146
|
+
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0));
|
|
8147
|
+
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1));
|
|
8148
|
+
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
|
|
8149
|
+
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
|
|
8150
|
+
#else // GGML_OPENCL_SOA_Q
|
|
6935
8151
|
kernel = backend_ctx->kernel_mul_mv_id_mxfp4_f32;
|
|
6936
8152
|
|
|
6937
8153
|
if (backend_ctx->gpu_family == INTEL) {
|
|
@@ -6971,7 +8187,7 @@ static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0,
|
|
|
6971
8187
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2));
|
|
6972
8188
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3));
|
|
6973
8189
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,nullptr));
|
|
6974
|
-
|
|
8190
|
+
#endif // GGML_OPENCL_SOA_Q
|
|
6975
8191
|
break;
|
|
6976
8192
|
}
|
|
6977
8193
|
default:
|
|
@@ -7404,6 +8620,7 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
7404
8620
|
const bool is_neox = mode & 2;
|
|
7405
8621
|
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
|
|
7406
8622
|
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
|
|
8623
|
+
const int is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
|
|
7407
8624
|
|
|
7408
8625
|
if (is_mrope) {
|
|
7409
8626
|
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
|
|
@@ -7494,9 +8711,14 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
|
|
|
7494
8711
|
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
|
|
7495
8712
|
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
|
|
7496
8713
|
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
|
|
8714
|
+
// both mrope and vision kernels have sections
|
|
7497
8715
|
if (is_mrope || is_vision) {
|
|
7498
8716
|
CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
|
|
7499
8717
|
}
|
|
8718
|
+
// only mrope has is_imrope
|
|
8719
|
+
if (is_mrope && !is_vision) {
|
|
8720
|
+
CL_CHECK(clSetKernelArg(kernel, 34, sizeof(int), &is_imrope));
|
|
8721
|
+
}
|
|
7500
8722
|
|
|
7501
8723
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
|
|
7502
8724
|
size_t local_work_size[] = {(size_t)nth, 1, 1};
|