@novastera-oss/llamarn 0.4.1 → 0.4.3-beta4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -22,54 +22,58 @@
|
|
|
22
22
|
|
|
23
23
|
#include "aclnn_ops.h"
|
|
24
24
|
|
|
25
|
+
#include "ggml-impl.h"
|
|
26
|
+
#include "ggml.h"
|
|
27
|
+
|
|
28
|
+
#include <aclnnop/aclnn_add.h>
|
|
25
29
|
#include <aclnnop/aclnn_addcdiv.h>
|
|
30
|
+
#include <aclnnop/aclnn_argmax.h>
|
|
26
31
|
#include <aclnnop/aclnn_avgpool2d.h>
|
|
27
32
|
#include <aclnnop/aclnn_batch_matmul.h>
|
|
28
33
|
#include <aclnnop/aclnn_cast.h>
|
|
34
|
+
#include <aclnnop/aclnn_clamp.h>
|
|
29
35
|
#include <aclnnop/aclnn_constant_pad_nd.h>
|
|
36
|
+
#include <aclnnop/aclnn_convolution.h>
|
|
30
37
|
#include <aclnnop/aclnn_copy.h>
|
|
31
38
|
#include <aclnnop/aclnn_div.h>
|
|
39
|
+
#include <aclnnop/aclnn_elu.h>
|
|
32
40
|
#include <aclnnop/aclnn_embedding.h>
|
|
41
|
+
#include <aclnnop/aclnn_eq_tensor.h>
|
|
33
42
|
#include <aclnnop/aclnn_exp.h>
|
|
34
43
|
#include <aclnnop/aclnn_fill_scalar.h>
|
|
44
|
+
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
|
35
45
|
#include <aclnnop/aclnn_group_norm.h>
|
|
46
|
+
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
|
47
|
+
#include <aclnnop/aclnn_gt_scalar.h>
|
|
48
|
+
#include <aclnnop/aclnn_im2col.h>
|
|
49
|
+
#include <aclnnop/aclnn_index_copy.h>
|
|
36
50
|
#include <aclnnop/aclnn_index_fill_tensor.h>
|
|
51
|
+
#include <aclnnop/aclnn_index_select.h>
|
|
37
52
|
#include <aclnnop/aclnn_layer_norm.h>
|
|
53
|
+
#include <aclnnop/aclnn_log.h>
|
|
38
54
|
#include <aclnnop/aclnn_matmul.h>
|
|
39
55
|
#include <aclnnop/aclnn_max_pool.h>
|
|
56
|
+
#include <aclnnop/aclnn_mean.h>
|
|
40
57
|
#include <aclnnop/aclnn_mm.h>
|
|
58
|
+
#include <aclnnop/aclnn_mul.h>
|
|
41
59
|
#include <aclnnop/aclnn_permute.h>
|
|
60
|
+
#include <aclnnop/aclnn_pow.h>
|
|
42
61
|
#include <aclnnop/aclnn_pow_tensor_tensor.h>
|
|
43
62
|
#include <aclnnop/aclnn_reduce_sum.h>
|
|
63
|
+
#include <aclnnop/aclnn_reflection_pad1d.h>
|
|
44
64
|
#include <aclnnop/aclnn_repeat.h>
|
|
45
65
|
#include <aclnnop/aclnn_repeat_interleave.h>
|
|
66
|
+
#include <aclnnop/aclnn_rms_norm.h>
|
|
46
67
|
#include <aclnnop/aclnn_roll.h>
|
|
47
68
|
#include <aclnnop/aclnn_softmax.h>
|
|
69
|
+
#include <aclnnop/aclnn_sub.h>
|
|
70
|
+
#include <aclnnop/aclnn_sum.h>
|
|
71
|
+
#include <aclnnop/aclnn_threshold.h>
|
|
48
72
|
#include <aclnnop/aclnn_tril.h>
|
|
49
73
|
#include <aclnnop/aclnn_triu.h>
|
|
50
74
|
#include <aclnnop/aclnn_upsample_nearest_2d.h>
|
|
51
75
|
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
|
|
52
|
-
#include <aclnnop/aclnn_argmax.h>
|
|
53
|
-
#include <aclnnop/aclnn_sum.h>
|
|
54
|
-
#include <aclnnop/aclnn_rms_norm.h>
|
|
55
|
-
#include <aclnnop/aclnn_im2col.h>
|
|
56
|
-
#include <aclnnop/aclnn_add.h>
|
|
57
|
-
#include <aclnnop/aclnn_sub.h>
|
|
58
|
-
#include <aclnnop/aclnn_mul.h>
|
|
59
|
-
#include <aclnnop/aclnn_div.h>
|
|
60
|
-
#include <aclnnop/aclnn_convolution.h>
|
|
61
|
-
#include <aclnnop/aclnn_elu.h>
|
|
62
|
-
#include <aclnnop/aclnn_log.h>
|
|
63
|
-
#include <aclnnop/aclnn_mean.h>
|
|
64
|
-
#include <aclnnop/aclnn_reflection_pad1d.h>
|
|
65
|
-
#include <aclnnop/aclnn_eq_tensor.h>
|
|
66
|
-
#include <aclnnop/aclnn_gt_scalar.h>
|
|
67
|
-
#include <aclnnop/aclnn_pow.h>
|
|
68
|
-
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
|
69
|
-
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
|
70
76
|
#include <aclnnop/aclnn_zero.h>
|
|
71
|
-
#include <aclnnop/aclnn_index_copy.h>
|
|
72
|
-
#include <aclnnop/aclnn_index_select.h>
|
|
73
77
|
#include <float.h>
|
|
74
78
|
|
|
75
79
|
#include <cmath>
|
|
@@ -77,76 +81,71 @@
|
|
|
77
81
|
#include <exception>
|
|
78
82
|
#include <vector>
|
|
79
83
|
|
|
80
|
-
#include "ggml-impl.h"
|
|
81
|
-
#include "ggml.h"
|
|
82
|
-
|
|
83
84
|
#define GGML_COMMON_DECL_C
|
|
84
85
|
|
|
85
86
|
#include "../ggml-common.h"
|
|
86
87
|
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
88
|
+
void bcast_shape(ggml_tensor * src0,
|
|
89
|
+
ggml_tensor * src1,
|
|
90
|
+
ggml_tensor * dst,
|
|
91
|
+
acl_tensor_ptr & acl_src0,
|
|
92
|
+
acl_tensor_ptr & acl_src1,
|
|
93
|
+
acl_tensor_ptr & acl_dst) {
|
|
90
94
|
GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_can_repeat(src1, src0));
|
|
91
95
|
// Need bcast
|
|
92
96
|
if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
|
|
93
97
|
BCAST_SHAPE(src0, src1)
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
98
|
+
acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
|
|
99
|
+
acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
|
|
100
|
+
acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
|
|
97
101
|
} else {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
102
|
+
acl_src0 = ggml_cann_create_tensor(src0);
|
|
103
|
+
acl_src1 = ggml_cann_create_tensor(src1);
|
|
104
|
+
acl_dst = ggml_cann_create_tensor(dst);
|
|
101
105
|
}
|
|
102
106
|
}
|
|
103
107
|
|
|
104
|
-
void ggml_cann_op_unary(
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
ggml_tensor* src = dst->src[0];
|
|
108
|
+
void ggml_cann_op_unary(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
|
|
109
|
+
ggml_backend_cann_context & ctx,
|
|
110
|
+
ggml_tensor * dst) {
|
|
111
|
+
ggml_tensor * src = dst->src[0];
|
|
108
112
|
|
|
109
|
-
|
|
110
|
-
|
|
113
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
114
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
111
115
|
|
|
112
|
-
unary_op(ctx, acl_src, acl_dst);
|
|
113
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
116
|
+
unary_op(ctx, acl_src.get(), acl_dst.get());
|
|
114
117
|
}
|
|
115
118
|
|
|
116
|
-
void ggml_cann_op_unary_gated(
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
ggml_tensor* src0 = dst->src[0];
|
|
120
|
-
ggml_tensor* src1 = dst->src[1];
|
|
119
|
+
void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, aclTensor *, aclTensor *)> unary_op,
|
|
120
|
+
ggml_backend_cann_context & ctx,
|
|
121
|
+
ggml_tensor * dst) {
|
|
122
|
+
ggml_tensor * src0 = dst->src[0];
|
|
123
|
+
ggml_tensor * src1 = dst->src[1];
|
|
121
124
|
|
|
122
125
|
GGML_ASSERT(ggml_is_contiguous_1(src0));
|
|
123
126
|
GGML_ASSERT(ggml_is_contiguous_1(dst));
|
|
124
127
|
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
|
125
128
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
if(src1) {
|
|
129
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
130
|
+
acl_tensor_ptr acl_src0, acl_src1;
|
|
131
|
+
if (src1) {
|
|
129
132
|
GGML_ASSERT(ggml_is_contiguous_1(src1));
|
|
130
133
|
GGML_ASSERT(src0->type == src1->type);
|
|
131
134
|
|
|
132
135
|
acl_src0 = ggml_cann_create_tensor(src0);
|
|
133
136
|
acl_src1 = ggml_cann_create_tensor(src1);
|
|
134
137
|
} else {
|
|
135
|
-
int64_t ne[] = {src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3]};
|
|
136
|
-
size_t
|
|
137
|
-
acl_src0
|
|
138
|
+
int64_t ne[] = { src0->ne[0] / 2, src0->ne[1], src0->ne[2], src0->ne[3] };
|
|
139
|
+
size_t nb[] = { src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3] };
|
|
140
|
+
acl_src0 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, 0);
|
|
138
141
|
acl_src1 = ggml_cann_create_tensor(src0, ne, nb, GGML_MAX_DIMS, ACL_FORMAT_ND, ne[0] * ggml_element_size(src0));
|
|
139
142
|
if (swapped) {
|
|
140
143
|
std::swap(acl_src0, acl_src1);
|
|
141
144
|
}
|
|
142
145
|
}
|
|
143
146
|
|
|
144
|
-
unary_op(ctx, acl_src0, acl_dst);
|
|
145
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst, acl_src1);
|
|
146
|
-
|
|
147
|
-
ggml_cann_release_resources(ctx, acl_src0, acl_dst);
|
|
148
|
-
if(src1)
|
|
149
|
-
ggml_cann_release_resources(ctx, acl_src1);
|
|
147
|
+
unary_op(ctx, acl_src0.get(), acl_dst.get());
|
|
148
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
|
|
150
149
|
}
|
|
151
150
|
|
|
152
151
|
/**
|
|
@@ -159,13 +158,14 @@ void ggml_cann_op_unary_gated(
|
|
|
159
158
|
* @param repeat_array The array specifying the number of repetitions along each
|
|
160
159
|
* dimension.
|
|
161
160
|
*/
|
|
162
|
-
static void aclnn_repeat(ggml_backend_cann_context& ctx,
|
|
163
|
-
aclTensor*
|
|
161
|
+
static void aclnn_repeat(ggml_backend_cann_context & ctx,
|
|
162
|
+
aclTensor * acl_src,
|
|
163
|
+
aclTensor * acl_dst,
|
|
164
|
+
int64_t * repeat_array) {
|
|
164
165
|
// repeat tensor along each dim with repeat_array
|
|
165
|
-
|
|
166
|
+
acl_int_array_ptr repeats = ggml_cann_create_int_array(repeat_array, GGML_MAX_DIMS);
|
|
166
167
|
|
|
167
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats, acl_dst);
|
|
168
|
-
ggml_cann_release_resources(ctx, repeats);
|
|
168
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Repeat, acl_src, repeats.get(), acl_dst);
|
|
169
169
|
}
|
|
170
170
|
|
|
171
171
|
/**
|
|
@@ -181,61 +181,60 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
|
181
181
|
* @param cast_data_type The target data type to which the source tensor will be
|
|
182
182
|
* casted.
|
|
183
183
|
*/
|
|
184
|
-
static void aclnn_cast(ggml_backend_cann_context& ctx,
|
|
185
|
-
|
|
184
|
+
static void aclnn_cast(ggml_backend_cann_context & ctx,
|
|
185
|
+
aclTensor * acl_src,
|
|
186
|
+
aclTensor * acl_dst,
|
|
187
|
+
aclDataType cast_data_type) {
|
|
186
188
|
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src, cast_data_type, acl_dst);
|
|
187
189
|
}
|
|
188
190
|
|
|
189
|
-
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
190
|
-
ggml_tensor* src = dst->src[0];
|
|
191
|
+
void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
192
|
+
ggml_tensor * src = dst->src[0];
|
|
191
193
|
GGML_ASSERT(ggml_can_repeat(src, dst));
|
|
192
194
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
196
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
195
197
|
|
|
196
|
-
int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2],
|
|
197
|
-
|
|
198
|
+
int64_t repeatsArray[] = { dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], dst->ne[1] / src->ne[1],
|
|
199
|
+
dst->ne[0] / src->ne[0] };
|
|
198
200
|
|
|
199
|
-
aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray);
|
|
200
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
201
|
+
aclnn_repeat(ctx, acl_src.get(), acl_dst.get(), repeatsArray);
|
|
201
202
|
}
|
|
202
203
|
|
|
203
|
-
void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
ggml_cann_release_resources(ctx, alpha);
|
|
204
|
+
void aclnn_add(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
|
|
205
|
+
float alphaValue = 1.0f;
|
|
206
|
+
acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
207
|
+
if (acl_dst != nullptr) {
|
|
208
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha.get(), acl_dst);
|
|
209
|
+
} else {
|
|
210
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_src0, acl_src1, alpha.get());
|
|
211
|
+
}
|
|
212
212
|
}
|
|
213
213
|
|
|
214
|
-
void aclnn_sub(ggml_backend_cann_context& ctx, aclTensor* acl_src0,
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
ggml_cann_release_resources(ctx, alpha);
|
|
214
|
+
void aclnn_sub(ggml_backend_cann_context & ctx, aclTensor * acl_src0, aclTensor * acl_src1, aclTensor * acl_dst) {
|
|
215
|
+
float alphaValue = 1.0f;
|
|
216
|
+
acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
217
|
+
if (acl_dst != nullptr) {
|
|
218
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Sub, acl_src0, acl_src1, alpha.get(), acl_dst);
|
|
219
|
+
} else {
|
|
220
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSub, acl_src0, acl_src1, alpha.get());
|
|
221
|
+
}
|
|
223
222
|
}
|
|
224
223
|
|
|
225
|
-
void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
226
|
-
|
|
227
|
-
if (acl_dst != nullptr)
|
|
224
|
+
void aclnn_mul(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
|
|
225
|
+
if (acl_dst != nullptr) {
|
|
228
226
|
GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_src, acl_other, acl_dst);
|
|
229
|
-
else
|
|
227
|
+
} else {
|
|
230
228
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_src, acl_other);
|
|
229
|
+
}
|
|
231
230
|
}
|
|
232
231
|
|
|
233
|
-
void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
234
|
-
|
|
235
|
-
if (acl_dst != nullptr)
|
|
232
|
+
void aclnn_div(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_other, aclTensor * acl_dst) {
|
|
233
|
+
if (acl_dst != nullptr) {
|
|
236
234
|
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src, acl_other, acl_dst);
|
|
237
|
-
else
|
|
235
|
+
} else {
|
|
238
236
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDiv, acl_src, acl_other);
|
|
237
|
+
}
|
|
239
238
|
}
|
|
240
239
|
|
|
241
240
|
/**
|
|
@@ -260,33 +259,30 @@ void aclnn_div(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
|
260
259
|
* @param inplace Flag indicating whether to perform the operation in-place on
|
|
261
260
|
* `acl_src`.
|
|
262
261
|
*/
|
|
263
|
-
static void aclnn_muls(ggml_backend_cann_context& ctx,
|
|
264
|
-
|
|
265
|
-
|
|
262
|
+
static void aclnn_muls(ggml_backend_cann_context & ctx,
|
|
263
|
+
aclTensor * acl_src,
|
|
264
|
+
float scale,
|
|
265
|
+
aclTensor * acl_dst,
|
|
266
|
+
bool inplace) {
|
|
267
|
+
acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
|
|
266
268
|
if (inplace) {
|
|
267
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale);
|
|
269
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_src, acl_scale.get());
|
|
268
270
|
} else {
|
|
269
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale, acl_dst);
|
|
271
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, acl_scale.get(), acl_dst);
|
|
270
272
|
}
|
|
271
|
-
ggml_cann_release_resources(ctx, acl_scale);
|
|
272
273
|
}
|
|
273
274
|
|
|
274
|
-
void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
275
|
-
ggml_tensor* src = dst->src[0];
|
|
275
|
+
void ggml_cann_leaky_relu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
276
|
+
ggml_tensor * src = dst->src[0];
|
|
276
277
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
|
281
|
-
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
|
278
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
279
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
282
280
|
|
|
283
281
|
float negative_slope;
|
|
284
282
|
memcpy(&negative_slope, dst->op_params, sizeof(float));
|
|
285
|
-
|
|
286
|
-
aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT);
|
|
283
|
+
acl_scalar_ptr acl_negative_slope = ggml_cann_create_scalar(&negative_slope, aclDataType::ACL_FLOAT);
|
|
287
284
|
|
|
288
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src, acl_negative_slope, acl_dst);
|
|
289
|
-
ggml_cann_release_resources(ctx, acl_negative_slope, acl_src, acl_dst);
|
|
285
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, LeakyRelu, acl_src.get(), acl_negative_slope.get(), acl_dst.get());
|
|
290
286
|
}
|
|
291
287
|
|
|
292
288
|
/**
|
|
@@ -299,29 +295,27 @@ void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
299
295
|
* stored.
|
|
300
296
|
* @param concat_dim The dimension along which the tensors will be concatenated.
|
|
301
297
|
*/
|
|
302
|
-
static void aclnn_concat(ggml_backend_cann_context& ctx,
|
|
303
|
-
aclTensorList*
|
|
304
|
-
|
|
298
|
+
static void aclnn_concat(ggml_backend_cann_context & ctx,
|
|
299
|
+
aclTensorList * tensorList,
|
|
300
|
+
aclTensor * acl_dst,
|
|
301
|
+
int64_t concat_dim) {
|
|
305
302
|
GGML_CANN_CALL_ACLNN_OP(ctx, Cat, tensorList, concat_dim, acl_dst);
|
|
306
303
|
}
|
|
307
304
|
|
|
308
|
-
void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
309
|
-
ggml_tensor*
|
|
310
|
-
ggml_tensor*
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
305
|
+
void ggml_cann_concat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
306
|
+
ggml_tensor * src0 = dst->src[0];
|
|
307
|
+
ggml_tensor * src1 = dst->src[1];
|
|
308
|
+
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
|
309
|
+
acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
|
|
310
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
314
311
|
|
|
315
312
|
const int32_t dim = ggml_get_op_params_i32(dst, 0);
|
|
316
313
|
|
|
317
314
|
GGML_ASSERT(dim >= 0 && dim < 4);
|
|
318
315
|
int32_t acl_dim = 3 - dim;
|
|
319
316
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
aclnn_concat(ctx, tensor_list, acl_dst, acl_dim);
|
|
323
|
-
|
|
324
|
-
ggml_cann_release_resources(ctx, tensor_list, acl_dst);
|
|
317
|
+
acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(acl_src0, acl_src1);
|
|
318
|
+
aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), acl_dim);
|
|
325
319
|
}
|
|
326
320
|
|
|
327
321
|
/**
|
|
@@ -341,169 +335,277 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
341
335
|
* @param step The step size between consecutive values.
|
|
342
336
|
* @param n_elements The number of elements in the destination tensor.
|
|
343
337
|
*/
|
|
344
|
-
static void aclnn_arange(ggml_backend_cann_context& ctx,
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
338
|
+
static void aclnn_arange(ggml_backend_cann_context & ctx,
|
|
339
|
+
aclTensor * acl_dst,
|
|
340
|
+
float start,
|
|
341
|
+
float stop,
|
|
342
|
+
float step,
|
|
343
|
+
int64_t n_elements) {
|
|
344
|
+
int64_t steps = (int64_t) std::ceil((stop - start) / step);
|
|
348
345
|
GGML_ASSERT(n_elements == steps);
|
|
349
346
|
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
347
|
+
acl_scalar_ptr acl_start = ggml_cann_create_scalar(&start, aclDataType::ACL_FLOAT);
|
|
348
|
+
acl_scalar_ptr acl_end = ggml_cann_create_scalar(&stop, aclDataType::ACL_FLOAT);
|
|
349
|
+
acl_scalar_ptr acl_step = ggml_cann_create_scalar(&step, aclDataType::ACL_FLOAT);
|
|
353
350
|
|
|
354
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start, acl_end, acl_step, acl_dst);
|
|
355
|
-
ggml_cann_release_resources(ctx, acl_start, acl_end, acl_step);
|
|
351
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Arange, acl_start.get(), acl_end.get(), acl_step.get(), acl_dst);
|
|
356
352
|
}
|
|
357
353
|
|
|
358
|
-
void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
354
|
+
void ggml_cann_arange(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
359
355
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
360
356
|
|
|
361
|
-
|
|
357
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
362
358
|
|
|
363
359
|
int64_t n_elements = ggml_nelements(dst);
|
|
364
|
-
float
|
|
365
|
-
float
|
|
366
|
-
float
|
|
367
|
-
memcpy(&start, (float*)dst->op_params + 0, sizeof(float));
|
|
368
|
-
memcpy(&stop, (float*)dst->op_params + 1, sizeof(float));
|
|
369
|
-
memcpy(&step, (float*)dst->op_params + 2, sizeof(float));
|
|
360
|
+
float start;
|
|
361
|
+
float stop;
|
|
362
|
+
float step;
|
|
363
|
+
memcpy(&start, (float *) dst->op_params + 0, sizeof(float));
|
|
364
|
+
memcpy(&stop, (float *) dst->op_params + 1, sizeof(float));
|
|
365
|
+
memcpy(&step, (float *) dst->op_params + 2, sizeof(float));
|
|
370
366
|
|
|
371
|
-
aclnn_arange(ctx, acl_dst, start, stop, step, n_elements);
|
|
372
|
-
ggml_cann_release_resources(ctx, acl_dst);
|
|
367
|
+
aclnn_arange(ctx, acl_dst.get(), start, stop, step, n_elements);
|
|
373
368
|
}
|
|
374
369
|
|
|
375
|
-
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
376
|
-
ggml_tensor* src = dst->src[0];
|
|
370
|
+
void ggml_cann_clamp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
371
|
+
ggml_tensor * src = dst->src[0];
|
|
377
372
|
|
|
378
373
|
float min;
|
|
379
374
|
float max;
|
|
380
375
|
memcpy(&min, dst->op_params, sizeof(float));
|
|
381
|
-
memcpy(&max, (float*)dst->op_params + 1, sizeof(float));
|
|
376
|
+
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
|
382
377
|
|
|
383
|
-
|
|
384
|
-
|
|
378
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
379
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
385
380
|
|
|
386
|
-
|
|
387
|
-
|
|
381
|
+
acl_scalar_ptr acl_min = ggml_cann_create_scalar(&min, aclDataType::ACL_FLOAT);
|
|
382
|
+
acl_scalar_ptr acl_max = ggml_cann_create_scalar(&max, aclDataType::ACL_FLOAT);
|
|
388
383
|
|
|
389
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src, acl_min, acl_max, acl_dst);
|
|
390
|
-
ggml_cann_release_resources(ctx, acl_min, acl_max, acl_src, acl_dst);
|
|
384
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_src.get(), acl_min.get(), acl_max.get(), acl_dst.get());
|
|
391
385
|
}
|
|
392
386
|
|
|
393
|
-
void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
394
|
-
ggml_tensor* src = dst->src[0];
|
|
387
|
+
void ggml_cann_scale(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
388
|
+
ggml_tensor * src = dst->src[0];
|
|
395
389
|
|
|
396
390
|
// scale factor
|
|
397
391
|
float v;
|
|
398
392
|
memcpy(&v, dst->op_params, sizeof(float));
|
|
399
393
|
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
394
|
+
acl_scalar_ptr scale = ggml_cann_create_scalar(&v, aclDataType::ACL_FLOAT);
|
|
395
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
396
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
403
397
|
|
|
404
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src, scale, acl_dst);
|
|
405
|
-
ggml_cann_release_resources(ctx, scale, acl_src, acl_dst);
|
|
398
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_src.get(), scale.get(), acl_dst.get());
|
|
406
399
|
}
|
|
407
400
|
|
|
408
|
-
void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
409
|
-
ggml_tensor*
|
|
410
|
-
enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0];
|
|
401
|
+
void ggml_cann_argsort(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
402
|
+
ggml_tensor * src = dst->src[0];
|
|
403
|
+
enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
|
|
411
404
|
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
ggml_cann_pool_alloc temp_buffer_allocator(
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
GGML_CANN_CALL_ACLNN_OP(ctx,
|
|
421
|
-
tmp_tensor);
|
|
422
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor, ggml_cann_type_mapping(dst->type), acl_dst);
|
|
423
|
-
ggml_cann_release_resources(ctx, acl_src, tmp_tensor, acl_dst);
|
|
405
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
406
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
407
|
+
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(int64_t));
|
|
408
|
+
void * buffer = temp_buffer_allocator.get();
|
|
409
|
+
acl_tensor_ptr tmp_tensor =
|
|
410
|
+
ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), dst->ne, dst->nb, GGML_MAX_DIMS);
|
|
411
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Argsort, acl_src.get(), -1, (order == GGML_SORT_ORDER_DESC ? true : false),
|
|
412
|
+
tmp_tensor.get());
|
|
413
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Cast, tmp_tensor.get(), ggml_cann_type_mapping(dst->type), acl_dst.get());
|
|
424
414
|
}
|
|
425
415
|
|
|
426
|
-
void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
427
|
-
ggml_tensor* src = dst->src[0];
|
|
416
|
+
void ggml_cann_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
417
|
+
ggml_tensor * src = dst->src[0];
|
|
428
418
|
|
|
429
|
-
|
|
430
|
-
|
|
419
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
420
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
431
421
|
|
|
432
422
|
float eps;
|
|
433
423
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
434
424
|
|
|
435
|
-
std::vector<int64_t> normData = {dst->ne[0]};
|
|
436
|
-
|
|
437
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src, norm, nullptr, nullptr,
|
|
438
|
-
|
|
439
|
-
|
|
425
|
+
std::vector<int64_t> normData = { dst->ne[0] };
|
|
426
|
+
acl_int_array_ptr norm = ggml_cann_create_int_array(normData.data(), normData.size());
|
|
427
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, LayerNorm, acl_src.get(), norm.get(), nullptr, nullptr, eps, acl_dst.get(), nullptr,
|
|
428
|
+
nullptr);
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
432
|
+
ggml_tensor * src = dst->src[0];
|
|
433
|
+
|
|
434
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
435
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
436
|
+
|
|
437
|
+
size_t type_size = ggml_type_size(src->type);
|
|
438
|
+
int64_t n_bytes = src->ne[3] * src->ne[2] * src->ne[1] * type_size;
|
|
439
|
+
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
|
|
440
|
+
void * buffer = temp_buffer_allocator.get();
|
|
441
|
+
|
|
442
|
+
int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
|
|
443
|
+
size_t div_nb[GGML_MAX_DIMS];
|
|
444
|
+
div_nb[0] = sizeof(float);
|
|
445
|
+
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
|
446
|
+
div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
|
|
447
|
+
}
|
|
448
|
+
acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
|
|
449
|
+
|
|
450
|
+
std::vector<int64_t> norm_dims = { 3 };
|
|
451
|
+
acl_int_array_ptr dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());
|
|
452
|
+
|
|
453
|
+
float p_value = 2.0f;
|
|
454
|
+
acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
|
|
455
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
|
|
456
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
|
|
440
457
|
}
|
|
441
458
|
|
|
442
|
-
void
|
|
443
|
-
ggml_tensor*
|
|
459
|
+
void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
460
|
+
ggml_tensor * src0 = dst->src[0];
|
|
461
|
+
ggml_tensor * src1 = dst->src[1];
|
|
462
|
+
|
|
463
|
+
const int64_t nc = src0->ne[0];
|
|
464
|
+
const int64_t nr = ggml_nrows(src0);
|
|
444
465
|
|
|
445
|
-
|
|
446
|
-
|
|
466
|
+
int64_t logits_ne[] = { nc, nr };
|
|
467
|
+
size_t logits_nb[2];
|
|
468
|
+
logits_nb[0] = ggml_type_size(src0->type);
|
|
469
|
+
logits_nb[1] = logits_nb[0] * logits_ne[0];
|
|
470
|
+
acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
|
|
471
|
+
|
|
472
|
+
size_t log_softmax_type_size = sizeof(float);
|
|
473
|
+
int64_t log_softmax_n_bytes = nr * nc * log_softmax_type_size;
|
|
474
|
+
ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
|
|
475
|
+
void * log_softmax_buffer = log_softmax_allocator.get();
|
|
476
|
+
|
|
477
|
+
int64_t log_softmax_ne[] = { nc, nr };
|
|
478
|
+
size_t log_softmax_nb[2];
|
|
479
|
+
log_softmax_nb[0] = log_softmax_type_size;
|
|
480
|
+
log_softmax_nb[1] = log_softmax_nb[0] * log_softmax_ne[0];
|
|
481
|
+
acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
|
|
482
|
+
log_softmax_ne, log_softmax_nb, 2);
|
|
483
|
+
|
|
484
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
|
|
485
|
+
|
|
486
|
+
int64_t labels_ne[] = { nc, nr };
|
|
487
|
+
size_t labels_nb[2];
|
|
488
|
+
labels_nb[0] = ggml_type_size(src1->type);
|
|
489
|
+
labels_nb[1] = labels_nb[0] * labels_ne[0];
|
|
490
|
+
acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);
|
|
491
|
+
|
|
492
|
+
size_t mul_type_size = sizeof(float);
|
|
493
|
+
int64_t mul_n_bytes = nr * nc * mul_type_size;
|
|
494
|
+
ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
|
|
495
|
+
void * mul_buffer = mul_allocator.get();
|
|
496
|
+
|
|
497
|
+
int64_t mul_ne[] = { nc, nr };
|
|
498
|
+
size_t mul_nb[2];
|
|
499
|
+
mul_nb[0] = mul_type_size;
|
|
500
|
+
mul_nb[1] = mul_nb[0] * mul_ne[0];
|
|
501
|
+
acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
|
|
502
|
+
|
|
503
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
|
|
504
|
+
|
|
505
|
+
size_t sum_per_sample_type_size = sizeof(float);
|
|
506
|
+
int64_t sum_per_sample_n_bytes = nr * sum_per_sample_type_size;
|
|
507
|
+
ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
|
|
508
|
+
void * sum_per_sample_buffer = sum_per_sample_allocator.get();
|
|
509
|
+
|
|
510
|
+
int64_t sum_per_sample_ne[] = { nr };
|
|
511
|
+
size_t sum_per_sample_nb[1];
|
|
512
|
+
sum_per_sample_nb[0] = sum_per_sample_type_size;
|
|
513
|
+
acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
|
|
514
|
+
sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
|
|
515
|
+
|
|
516
|
+
std::vector<int64_t> sum_dims = { 1 };
|
|
517
|
+
acl_int_array_ptr dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
|
|
518
|
+
bool keep_dims = false;
|
|
519
|
+
|
|
520
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
|
|
521
|
+
acl_sum_per_sample.get());
|
|
522
|
+
|
|
523
|
+
size_t total_sum_type_size = sizeof(float);
|
|
524
|
+
int64_t total_sum_n_bytes = 1 * total_sum_type_size;
|
|
525
|
+
ggml_cann_pool_alloc total_sum_allocator(ctx.pool(), total_sum_n_bytes);
|
|
526
|
+
void * total_sum_buffer = total_sum_allocator.get();
|
|
527
|
+
|
|
528
|
+
int64_t total_sum_ne[] = { 1 };
|
|
529
|
+
size_t total_sum_nb[1];
|
|
530
|
+
total_sum_nb[0] = total_sum_type_size;
|
|
531
|
+
|
|
532
|
+
acl_tensor_ptr acl_total_sum =
|
|
533
|
+
ggml_cann_create_tensor(total_sum_buffer, ACL_FLOAT, total_sum_type_size, total_sum_ne, total_sum_nb, 1);
|
|
534
|
+
|
|
535
|
+
std::vector<int64_t> total_sum_dims = { 0 };
|
|
536
|
+
acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
|
|
537
|
+
|
|
538
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
|
|
539
|
+
acl_total_sum.get());
|
|
540
|
+
|
|
541
|
+
float value = -1.0f / static_cast<float>(nr);
|
|
542
|
+
acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
|
|
543
|
+
acl_tensor_ptr acl_dst =
|
|
544
|
+
ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
|
|
545
|
+
|
|
546
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Muls, acl_total_sum.get(), scale_factor.get(), acl_dst.get());
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
550
|
+
ggml_tensor * src = dst->src[0];
|
|
551
|
+
|
|
552
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
553
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
447
554
|
|
|
448
555
|
int n_groups = dst->op_params[0];
|
|
449
556
|
|
|
450
557
|
float eps;
|
|
451
558
|
memcpy(&eps, dst->op_params + 1, sizeof(float));
|
|
452
559
|
|
|
453
|
-
int64_t N
|
|
454
|
-
int64_t C
|
|
560
|
+
int64_t N = src->ne[3];
|
|
561
|
+
int64_t C = src->ne[2];
|
|
455
562
|
int64_t HxW = src->ne[1] * src->ne[0];
|
|
456
563
|
|
|
457
|
-
size_t
|
|
458
|
-
int64_t ne[]
|
|
459
|
-
size_t
|
|
460
|
-
size_t
|
|
564
|
+
size_t type_size = ggml_type_size(src->type);
|
|
565
|
+
int64_t ne[] = { n_groups, N };
|
|
566
|
+
size_t nb[] = { type_size, type_size * n_groups };
|
|
567
|
+
size_t n_bytes = N * n_groups;
|
|
461
568
|
|
|
462
569
|
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2);
|
|
463
|
-
void*
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
(char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
|
|
570
|
+
void * buffer = temp_buffer_allocator.get();
|
|
571
|
+
acl_tensor_ptr acl_mean_out = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
|
|
572
|
+
acl_tensor_ptr acl_rstd_out =
|
|
573
|
+
ggml_cann_create_tensor((char *) buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND);
|
|
468
574
|
|
|
469
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps,
|
|
470
|
-
|
|
471
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_mean_out, acl_rstd_out);
|
|
575
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GroupNorm, acl_src.get(), nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst.get(),
|
|
576
|
+
acl_mean_out.get(), acl_rstd_out.get());
|
|
472
577
|
}
|
|
473
578
|
|
|
474
|
-
void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
475
|
-
ggml_tensor* src0 = dst->src[0];
|
|
476
|
-
ggml_tensor* src1 = dst->src[1];
|
|
579
|
+
void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
580
|
+
ggml_tensor * src0 = dst->src[0];
|
|
581
|
+
ggml_tensor * src1 = dst->src[1];
|
|
477
582
|
|
|
478
|
-
size_t nb1
|
|
479
|
-
size_t nb2
|
|
480
|
-
size_t nb3
|
|
481
|
-
size_t offset
|
|
482
|
-
bool
|
|
583
|
+
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
|
584
|
+
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
|
585
|
+
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
|
586
|
+
size_t offset = ((int32_t *) dst->op_params)[3];
|
|
587
|
+
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
|
483
588
|
|
|
484
|
-
size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3};
|
|
589
|
+
size_t param_nb[] = { ggml_element_size(src0), nb1, nb2, nb3 };
|
|
485
590
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
|
|
591
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
|
|
592
|
+
acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
|
|
489
593
|
|
|
490
|
-
|
|
491
|
-
float
|
|
492
|
-
alpha
|
|
594
|
+
acl_scalar_ptr alpha = nullptr;
|
|
595
|
+
float alphaValue = 1.0f;
|
|
596
|
+
alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
493
597
|
|
|
494
598
|
if (!inplace) {
|
|
495
599
|
size_t cpy_size = ggml_nbytes(dst);
|
|
496
|
-
|
|
497
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
|
498
|
-
|
|
499
|
-
src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
|
|
600
|
+
ACL_CHECK(
|
|
601
|
+
aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
|
602
|
+
acl_tensor_ptr acl_src0 =
|
|
603
|
+
ggml_cann_create_tensor(src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset);
|
|
500
604
|
|
|
501
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0, acl_src1, alpha, acl_dst);
|
|
502
|
-
ggml_cann_release_resources(ctx, acl_src0);
|
|
605
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Add, acl_src0.get(), acl_src1.get(), alpha.get(), acl_dst.get());
|
|
503
606
|
} else {
|
|
504
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, acl_src1, alpha);
|
|
607
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), acl_src1.get(), alpha.get());
|
|
505
608
|
}
|
|
506
|
-
ggml_cann_release_resources(ctx, acl_src1, acl_dst);
|
|
507
609
|
}
|
|
508
610
|
|
|
509
611
|
/**
|
|
@@ -516,42 +618,36 @@ void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
516
618
|
* @param dim An array of dimension indices.
|
|
517
619
|
* @param dim_size The number of dimensions.
|
|
518
620
|
*/
|
|
519
|
-
static void aclnn_reduce_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
|
520
|
-
int64_t* dim, size_t dim_size) {
|
|
621
|
+
static void aclnn_reduce_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t * dim, size_t dim_size) {
|
|
521
622
|
GGML_ASSERT(dst->ne[0] == 1);
|
|
522
|
-
ggml_tensor*
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
623
|
+
ggml_tensor * src = dst->src[0];
|
|
624
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
625
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
626
|
+
acl_int_array_ptr reduce_dims = ggml_cann_create_int_array(dim, dim_size);
|
|
526
627
|
|
|
527
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src, reduce_dims, true,
|
|
528
|
-
|
|
529
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, reduce_dims);
|
|
628
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_src.get(), reduce_dims.get(), true, ggml_cann_type_mapping(dst->type),
|
|
629
|
+
acl_dst.get());
|
|
530
630
|
}
|
|
531
631
|
|
|
532
|
-
void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
533
|
-
int64_t reduce_dims[] = {3};
|
|
632
|
+
void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
633
|
+
int64_t reduce_dims[] = { 3 };
|
|
534
634
|
aclnn_reduce_sum(ctx, dst, reduce_dims, 1);
|
|
535
635
|
}
|
|
536
636
|
|
|
537
|
-
void ggml_cann_sum(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
538
|
-
int64_t reduce_dims[] = {0, 1, 2, 3};
|
|
637
|
+
void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
638
|
+
int64_t reduce_dims[] = { 0, 1, 2, 3 };
|
|
539
639
|
aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
|
|
540
640
|
}
|
|
541
641
|
|
|
542
|
-
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
547
|
-
aclTensor* acl_dst =
|
|
548
|
-
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
642
|
+
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
643
|
+
ggml_tensor * src = dst->src[0];
|
|
644
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
645
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
549
646
|
|
|
550
|
-
std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
|
|
551
|
-
|
|
647
|
+
std::vector<int64_t> output_size{ dst->ne[1], dst->ne[0] };
|
|
648
|
+
acl_int_array_ptr output_size_array = ggml_cann_create_int_array(output_size.data(), 2);
|
|
552
649
|
|
|
553
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src, output_size_array, acl_dst);
|
|
554
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, output_size_array);
|
|
650
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, UpsampleNearest2d, acl_src.get(), output_size_array.get(), acl_dst.get());
|
|
555
651
|
}
|
|
556
652
|
|
|
557
653
|
/**
|
|
@@ -568,30 +664,37 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
|
|
|
568
664
|
* The size of the array should be twice the number of dimensions of the tensor.
|
|
569
665
|
* @param value The value to be used for padding. The default value is 0.0.
|
|
570
666
|
*/
|
|
571
|
-
static void aclnn_pad(ggml_backend_cann_context& ctx,
|
|
572
|
-
aclTensor
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
667
|
+
static void aclnn_pad(ggml_backend_cann_context & ctx,
|
|
668
|
+
aclTensor * acl_src,
|
|
669
|
+
aclTensor * acl_dst,
|
|
670
|
+
int64_t * paddings,
|
|
671
|
+
float value = 0.0f) {
|
|
672
|
+
acl_int_array_ptr acl_pad = ggml_cann_create_int_array(paddings, GGML_MAX_DIMS * 2);
|
|
673
|
+
acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
|
|
576
674
|
|
|
577
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad, acl_value, acl_dst);
|
|
578
|
-
ggml_cann_release_resources(ctx, acl_pad, acl_value);
|
|
675
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ConstantPadNd, acl_src, acl_pad.get(), acl_value.get(), acl_dst);
|
|
579
676
|
}
|
|
580
677
|
|
|
581
|
-
void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
582
|
-
ggml_tensor*
|
|
583
|
-
|
|
584
|
-
|
|
678
|
+
void ggml_cann_pad(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
679
|
+
ggml_tensor * src = dst->src[0];
|
|
680
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
681
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
585
682
|
|
|
586
683
|
// padding: value in the array means how much distance will be padding.
|
|
587
684
|
// the position of elements in the array means which dirction to padding,
|
|
588
685
|
// each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
|
|
589
686
|
// dim2.front, dim2.behind, dim3.front, dim3.behind]
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
687
|
+
const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
|
|
688
|
+
const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
|
|
689
|
+
const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
|
|
690
|
+
const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
|
|
691
|
+
const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
|
|
692
|
+
const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
|
|
693
|
+
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
|
|
694
|
+
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
|
|
695
|
+
|
|
696
|
+
int64_t paddings[] = { lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3 };
|
|
697
|
+
aclnn_pad(ctx, acl_src.get(), acl_dst.get(), paddings);
|
|
595
698
|
}
|
|
596
699
|
|
|
597
700
|
/**
|
|
@@ -606,46 +709,40 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
606
709
|
* @param dst The destination tensor where the result will be stored. The source
|
|
607
710
|
* tensor is referenced by `dst->src[0]`.
|
|
608
711
|
*/
|
|
609
|
-
static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
|
|
610
|
-
|
|
611
|
-
ggml_tensor* src = dst->src[0];
|
|
712
|
+
static void ggml_cann_avg_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
713
|
+
ggml_tensor * src = dst->src[0];
|
|
612
714
|
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
|
613
715
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
614
716
|
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
const
|
|
621
|
-
const int
|
|
622
|
-
const int
|
|
623
|
-
const int
|
|
624
|
-
const int
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
std::vector<int64_t>
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
int64_t divisor_override = 0;
|
|
639
|
-
int8_t cube_math_type = 0;
|
|
717
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
718
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
719
|
+
|
|
720
|
+
const int32_t * opts = (const int32_t *) dst->op_params;
|
|
721
|
+
const int k0 = opts[1];
|
|
722
|
+
const int k1 = opts[2];
|
|
723
|
+
const int s0 = opts[3];
|
|
724
|
+
const int s1 = opts[4];
|
|
725
|
+
const int p0 = opts[5];
|
|
726
|
+
const int p1 = opts[6];
|
|
727
|
+
|
|
728
|
+
std::vector<int64_t> kernel_dims = { k1, k0 };
|
|
729
|
+
std::vector<int64_t> stride_dims = { s1, s0 };
|
|
730
|
+
std::vector<int64_t> padding_avg_dims = { p1, p0 }; // (padH, padW)
|
|
731
|
+
|
|
732
|
+
acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
|
|
733
|
+
acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
|
|
734
|
+
acl_int_array_ptr paddings_avg = ggml_cann_create_int_array(padding_avg_dims.data(), 2);
|
|
735
|
+
|
|
736
|
+
bool ceil_mode = false;
|
|
737
|
+
bool count_include_pad = true;
|
|
738
|
+
int64_t divisor_override = 0;
|
|
739
|
+
int8_t cube_math_type = 0;
|
|
640
740
|
#ifdef ASCEND_310P
|
|
641
741
|
cube_math_type = 1;
|
|
642
742
|
#endif
|
|
643
743
|
|
|
644
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src, kernel_size, strides, paddings_avg,
|
|
645
|
-
|
|
646
|
-
cube_math_type, acl_dst);
|
|
647
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, kernel_size, strides,
|
|
648
|
-
paddings_avg);
|
|
744
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, AvgPool2d, acl_src.get(), kernel_size.get(), strides.get(), paddings_avg.get(),
|
|
745
|
+
ceil_mode, count_include_pad, divisor_override, cube_math_type, acl_dst.get());
|
|
649
746
|
}
|
|
650
747
|
|
|
651
748
|
/**
|
|
@@ -660,68 +757,60 @@ static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
|
|
|
660
757
|
* @param dst The destination tensor where the result will be stored. The source
|
|
661
758
|
* tensor is referenced by `dst->src[0]`.
|
|
662
759
|
*/
|
|
663
|
-
static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
|
|
664
|
-
|
|
665
|
-
ggml_tensor* src = dst->src[0];
|
|
760
|
+
static void ggml_cann_max_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
761
|
+
ggml_tensor * src = dst->src[0];
|
|
666
762
|
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
|
667
763
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
668
764
|
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
aclTensor* acl_dst =
|
|
672
|
-
ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
765
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
766
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
|
673
767
|
|
|
674
|
-
const int32_t* opts = (const int32_t*)dst->op_params;
|
|
675
|
-
const int
|
|
676
|
-
const int
|
|
677
|
-
const int
|
|
678
|
-
const int
|
|
679
|
-
const int
|
|
680
|
-
const int
|
|
768
|
+
const int32_t * opts = (const int32_t *) dst->op_params;
|
|
769
|
+
const int k0 = opts[1];
|
|
770
|
+
const int k1 = opts[2];
|
|
771
|
+
const int s0 = opts[3];
|
|
772
|
+
const int s1 = opts[4];
|
|
773
|
+
const int p0 = opts[5];
|
|
774
|
+
const int p1 = opts[6];
|
|
681
775
|
|
|
682
|
-
int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
|
|
683
|
-
|
|
684
|
-
size_t temp_nb[GGML_MAX_DIMS];
|
|
776
|
+
int64_t temp_ne[] = { src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2], src->ne[3] };
|
|
777
|
+
size_t temp_nb[GGML_MAX_DIMS];
|
|
685
778
|
|
|
686
779
|
temp_nb[0] = ggml_element_size(src);
|
|
687
780
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
688
781
|
temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
|
|
689
782
|
}
|
|
690
783
|
|
|
691
|
-
ggml_cann_pool_alloc temp_buffer_allocator(
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
|
|
696
|
-
GGML_MAX_DIMS, ACL_FORMAT_NCHW);
|
|
784
|
+
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
|
|
785
|
+
void * buffer = temp_buffer_allocator.get();
|
|
786
|
+
acl_tensor_ptr tmp_tensor = ggml_cann_create_tensor(buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
|
|
787
|
+
GGML_MAX_DIMS, ACL_FORMAT_NCHW);
|
|
697
788
|
|
|
698
789
|
// pad: see padding in ggml_cann_pad()
|
|
699
|
-
int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
|
|
700
|
-
float
|
|
701
|
-
aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
|
|
790
|
+
int64_t paddings[] = { p0, p0, p1, p1, 0, 0, 0, 0 };
|
|
791
|
+
float value = -FLT_MAX;
|
|
792
|
+
aclnn_pad(ctx, acl_src.get(), tmp_tensor.get(), paddings, value);
|
|
702
793
|
|
|
703
794
|
// max_pool
|
|
704
|
-
std::vector<int64_t> kernel_dims
|
|
705
|
-
std::vector<int64_t> stride_dims
|
|
795
|
+
std::vector<int64_t> kernel_dims = { k1, k0 };
|
|
796
|
+
std::vector<int64_t> stride_dims = { s1, s0 };
|
|
706
797
|
// padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
|
|
707
|
-
std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
|
|
708
|
-
std::vector<int64_t> dilation_size
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
bool
|
|
798
|
+
std::vector<int64_t> padding_max_dims = { 0, 0, 0, 0 };
|
|
799
|
+
std::vector<int64_t> dilation_size = { 1, 1 };
|
|
800
|
+
acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
|
|
801
|
+
acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
|
|
802
|
+
acl_int_array_ptr paddings_max = ggml_cann_create_int_array(padding_max_dims.data(), 4);
|
|
803
|
+
acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
|
|
804
|
+
|
|
805
|
+
bool ceil_mode = false;
|
|
715
806
|
int64_t auto_pads = 0;
|
|
716
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor, kernel_size, strides, auto_pads,
|
|
717
|
-
|
|
718
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, tmp_tensor, kernel_size,
|
|
719
|
-
strides, paddings_max, dilations);
|
|
807
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, MaxPool, tmp_tensor.get(), kernel_size.get(), strides.get(), auto_pads,
|
|
808
|
+
paddings_max.get(), dilations.get(), ceil_mode, acl_dst.get());
|
|
720
809
|
}
|
|
721
810
|
|
|
722
|
-
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
723
|
-
const int32_t*
|
|
724
|
-
enum ggml_op_pool op
|
|
811
|
+
void ggml_cann_pool2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
812
|
+
const int32_t * opts = (const int32_t *) dst->op_params;
|
|
813
|
+
enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
|
|
725
814
|
switch (op) {
|
|
726
815
|
case GGML_OP_POOL_AVG:
|
|
727
816
|
ggml_cann_avg_pool2d(ctx, dst);
|
|
@@ -745,42 +834,37 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
745
834
|
* @param acl_src The source tensor from which data will be copied.
|
|
746
835
|
* @param acl_dst The destination tensor where the data will be copied to.
|
|
747
836
|
*/
|
|
748
|
-
static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
749
|
-
aclTensor* acl_dst) {
|
|
837
|
+
static void cann_copy(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
750
838
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_dst, acl_src);
|
|
751
839
|
}
|
|
752
840
|
|
|
753
|
-
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
754
|
-
ggml_tensor* src0 = dst->src[0];
|
|
841
|
+
void ggml_cann_dup(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
842
|
+
ggml_tensor * src0 = dst->src[0];
|
|
755
843
|
|
|
756
844
|
if (ggml_are_same_shape(src0, dst)) {
|
|
757
|
-
|
|
758
|
-
|
|
845
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
846
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
759
847
|
if (dst->type == src0->type) {
|
|
760
|
-
cann_copy(ctx, acl_src, acl_dst);
|
|
848
|
+
cann_copy(ctx, acl_src.get(), acl_dst.get());
|
|
761
849
|
} else {
|
|
762
|
-
aclnn_cast(ctx, acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
|
850
|
+
aclnn_cast(ctx, acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
|
|
763
851
|
}
|
|
764
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
765
852
|
} else {
|
|
766
|
-
void*
|
|
853
|
+
void * src_trans_buffer = src0->data;
|
|
767
854
|
ggml_cann_pool_alloc src_buffer_allocator;
|
|
768
855
|
if (!ggml_is_contiguous(src0)) {
|
|
769
|
-
|
|
770
|
-
src_buffer_allocator.alloc(ctx.pool(),
|
|
771
|
-
ggml_nelements(src0) * ggml_type_size(src0->type));
|
|
856
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
857
|
+
src_buffer_allocator.alloc(ctx.pool(), ggml_nelements(src0) * ggml_type_size(src0->type));
|
|
772
858
|
src_trans_buffer = src_buffer_allocator.get();
|
|
773
859
|
size_t src_trans_nb[GGML_MAX_DIMS];
|
|
774
860
|
src_trans_nb[0] = ggml_type_size(src0->type);
|
|
775
861
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
776
862
|
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
|
777
863
|
}
|
|
778
|
-
|
|
779
|
-
src_trans_buffer, ggml_cann_type_mapping(src0->type),
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
cann_copy(ctx, acl_src, src_trans_tensor);
|
|
783
|
-
ggml_cann_release_resources(ctx, acl_src, src_trans_tensor);
|
|
864
|
+
acl_tensor_ptr src_trans_tensor =
|
|
865
|
+
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type),
|
|
866
|
+
ggml_type_size(src0->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
867
|
+
cann_copy(ctx, acl_src.get(), src_trans_tensor.get());
|
|
784
868
|
}
|
|
785
869
|
|
|
786
870
|
size_t src_reshape_nb[GGML_MAX_DIMS];
|
|
@@ -789,19 +873,17 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
789
873
|
src_reshape_nb[i] = src_reshape_nb[i - 1] * dst->ne[i - 1];
|
|
790
874
|
}
|
|
791
875
|
|
|
792
|
-
|
|
793
|
-
ggml_cann_type_mapping(src0->type),ggml_type_size(src0->type),
|
|
794
|
-
|
|
795
|
-
|
|
876
|
+
acl_tensor_ptr trans_acl_src =
|
|
877
|
+
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
|
878
|
+
dst->ne, src_reshape_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
879
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
796
880
|
|
|
797
881
|
if (dst->type == src0->type) {
|
|
798
|
-
cann_copy(ctx, trans_acl_src, acl_dst);
|
|
882
|
+
cann_copy(ctx, trans_acl_src.get(), acl_dst.get());
|
|
799
883
|
} else {
|
|
800
|
-
aclnn_cast(ctx, trans_acl_src, acl_dst, ggml_cann_type_mapping(dst->type));
|
|
884
|
+
aclnn_cast(ctx, trans_acl_src.get(), acl_dst.get(), ggml_cann_type_mapping(dst->type));
|
|
801
885
|
}
|
|
802
|
-
ggml_cann_release_resources(ctx, trans_acl_src, acl_dst);
|
|
803
886
|
}
|
|
804
|
-
return;
|
|
805
887
|
}
|
|
806
888
|
|
|
807
889
|
/**
|
|
@@ -818,20 +900,23 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
818
900
|
* @param dims The number of dimensions of the tensor.
|
|
819
901
|
* @param type The data type of the tensor.
|
|
820
902
|
* @param type_size The size of each element in the tensor data type.
|
|
821
|
-
* @return
|
|
903
|
+
* @return A tensor smart pointer initialized with zeros.
|
|
822
904
|
*/
|
|
823
|
-
static
|
|
824
|
-
|
|
825
|
-
|
|
905
|
+
static acl_tensor_ptr aclnn_zero(ggml_backend_cann_context & ctx,
|
|
906
|
+
void * buffer,
|
|
907
|
+
size_t n_bytes,
|
|
908
|
+
int64_t * ne,
|
|
909
|
+
int64_t dims,
|
|
910
|
+
aclDataType type,
|
|
911
|
+
size_t type_size) {
|
|
826
912
|
size_t nb[GGML_MAX_DIMS];
|
|
827
913
|
nb[0] = type_size;
|
|
828
914
|
for (int i = 1; i < dims; i++) {
|
|
829
915
|
nb[i] = nb[i - 1] * ne[i - 1];
|
|
830
916
|
}
|
|
831
917
|
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero);
|
|
918
|
+
acl_tensor_ptr zero = ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims);
|
|
919
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceZero, zero.get());
|
|
835
920
|
return zero;
|
|
836
921
|
GGML_UNUSED(n_bytes);
|
|
837
922
|
}
|
|
@@ -852,18 +937,21 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
|
|
|
852
937
|
* @param type_size The size of each element in the tensor data type.
|
|
853
938
|
* @param value The value to be used for initializing the tensor (default
|
|
854
939
|
* is 1.0).
|
|
855
|
-
* @return
|
|
940
|
+
* @return A tensor smart pointer initialized with value.
|
|
856
941
|
*/
|
|
857
|
-
static
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
942
|
+
static acl_tensor_ptr aclnn_values(ggml_backend_cann_context & ctx,
|
|
943
|
+
void * buffer,
|
|
944
|
+
size_t n_bytes,
|
|
945
|
+
int64_t * ne,
|
|
946
|
+
int64_t dims,
|
|
947
|
+
aclDataType type,
|
|
948
|
+
size_t type_size,
|
|
949
|
+
float value = 1.0f) {
|
|
950
|
+
acl_tensor_ptr acl_tensor = aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
|
|
951
|
+
float alpha_host = 1.0f;
|
|
952
|
+
acl_scalar_ptr alpha = ggml_cann_create_scalar(&alpha_host, aclDataType::ACL_FLOAT);
|
|
953
|
+
acl_scalar_ptr other = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
|
|
954
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_tensor.get(), other.get(), alpha.get());
|
|
867
955
|
return acl_tensor;
|
|
868
956
|
}
|
|
869
957
|
|
|
@@ -877,22 +965,19 @@ static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
|
|
|
877
965
|
* @param scalar The scalar value used to fill the tensor.
|
|
878
966
|
* @param acl_dst The destination tensor to be filled with the scalar value.
|
|
879
967
|
*/
|
|
880
|
-
static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar);
|
|
884
|
-
ggml_cann_release_resources(ctx, acl_scalar);
|
|
968
|
+
static void aclnn_fill_scalar(ggml_backend_cann_context & ctx, float scalar, aclTensor * acl_dst) {
|
|
969
|
+
acl_scalar_ptr acl_scalar = ggml_cann_create_scalar(&scalar, aclDataType::ACL_FLOAT);
|
|
970
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst, acl_scalar.get());
|
|
885
971
|
}
|
|
886
972
|
|
|
887
973
|
/**
|
|
888
|
-
* @brief Get or expand a cached
|
|
974
|
+
* @brief Get or expand a cached tensor filled with a scalar value.
|
|
889
975
|
*
|
|
890
|
-
* This function manages cached device memory for
|
|
976
|
+
* This function manages cached device memory for tensors. If the current
|
|
891
977
|
* cache size is insufficient for the requested tensor shape, the old memory will
|
|
892
|
-
* be released and new memory will be allocated. The allocated buffer is
|
|
893
|
-
* initialized
|
|
894
|
-
*
|
|
895
|
-
* cached memory and returned.
|
|
978
|
+
* be released and new memory will be allocated. The allocated buffer is
|
|
979
|
+
* initialized with the given scalar value using CANN operations.
|
|
980
|
+
* Finally, an aclTensor object is created from the cached memory and returned.
|
|
896
981
|
*
|
|
897
982
|
* @param ctx The CANN backend context that manages device memory.
|
|
898
983
|
* @param buffer A pointer to the cached device buffer (will be allocated
|
|
@@ -901,25 +986,26 @@ static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar,
|
|
|
901
986
|
* updated when the cache is expanded.
|
|
902
987
|
* @param ne The tensor shape array (number of elements in each dimension).
|
|
903
988
|
* @param nb The stride size for each dimension.
|
|
989
|
+
* @param dtype Data type of cached tensor.
|
|
904
990
|
* @param dims The number of tensor dimensions.
|
|
905
991
|
* @param value The scalar value used to fill the tensor (supports zero
|
|
906
992
|
* initialization via memset or arbitrary values via fill_scalar).
|
|
907
|
-
* @return
|
|
993
|
+
* @return A tensor smart pointer created from the cached buffer.
|
|
908
994
|
*/
|
|
909
|
-
static
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
995
|
+
static acl_tensor_ptr get_cache_acl_tensor(ggml_backend_cann_context & ctx,
|
|
996
|
+
void ** buffer,
|
|
997
|
+
int64_t & cache_element,
|
|
998
|
+
int64_t * ne,
|
|
999
|
+
size_t * nb,
|
|
1000
|
+
ggml_type dtype,
|
|
1001
|
+
int64_t dims,
|
|
1002
|
+
float value) {
|
|
917
1003
|
// Calculate total number of elements
|
|
918
1004
|
int64_t n_element = 1;
|
|
919
1005
|
for (int i = 0; i < dims; i++) {
|
|
920
1006
|
n_element *= ne[i];
|
|
921
1007
|
}
|
|
922
|
-
size_t size = n_element *
|
|
1008
|
+
size_t size = n_element * ggml_type_size(dtype);
|
|
923
1009
|
|
|
924
1010
|
// Allocate or expand cache if needed
|
|
925
1011
|
if (cache_element < n_element) {
|
|
@@ -932,92 +1018,78 @@ static aclTensor* get_f32_cache_acl_tensor(
|
|
|
932
1018
|
cache_element = n_element;
|
|
933
1019
|
|
|
934
1020
|
// Initialize cache
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
aclTensor* acl_value = ggml_cann_create_tensor(
|
|
941
|
-
*buffer, ACL_FLOAT, sizeof(float), pool_ne, pool_nb, 1);
|
|
942
|
-
aclnn_fill_scalar(ctx, 1, acl_value);
|
|
943
|
-
ggml_cann_release_resources(ctx, acl_value);
|
|
944
|
-
}
|
|
1021
|
+
int64_t pool_ne[1] = { n_element };
|
|
1022
|
+
size_t pool_nb[1] = { ggml_type_size(dtype) };
|
|
1023
|
+
acl_tensor_ptr acl_value =
|
|
1024
|
+
ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), pool_ne, pool_nb, 1);
|
|
1025
|
+
aclnn_fill_scalar(ctx, value, acl_value.get());
|
|
945
1026
|
}
|
|
946
1027
|
|
|
947
|
-
return ggml_cann_create_tensor(*buffer,
|
|
1028
|
+
return ggml_cann_create_tensor(*buffer, ggml_cann_type_mapping(dtype), ggml_type_size(dtype), ne, nb, dims);
|
|
948
1029
|
}
|
|
949
1030
|
|
|
950
|
-
void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
951
|
-
ggml_tensor* src = dst->src[0];
|
|
1031
|
+
void ggml_cann_rms_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1032
|
+
ggml_tensor * src = dst->src[0];
|
|
952
1033
|
|
|
953
|
-
|
|
954
|
-
|
|
1034
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
1035
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
955
1036
|
|
|
956
1037
|
float eps;
|
|
957
1038
|
memcpy(&eps, dst->op_params, sizeof(float));
|
|
958
1039
|
|
|
959
|
-
// build gamma
|
|
1040
|
+
// build gamma.
|
|
960
1041
|
size_t acl_gamma_nb[GGML_MAX_DIMS];
|
|
961
|
-
|
|
1042
|
+
// gamma's type is the same as dst.
|
|
1043
|
+
acl_gamma_nb[0] = ggml_type_size(dst->type);
|
|
962
1044
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
963
1045
|
acl_gamma_nb[i] = acl_gamma_nb[i - 1] * src->ne[i - 1];
|
|
964
1046
|
}
|
|
965
|
-
|
|
966
|
-
ctx,
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
src->ne,
|
|
970
|
-
acl_gamma_nb,
|
|
971
|
-
1, // dims
|
|
972
|
-
1.0f // value
|
|
1047
|
+
acl_tensor_ptr acl_gamma = get_cache_acl_tensor(
|
|
1048
|
+
ctx, &ctx.rms_norm_one_tensor_cache.cache, ctx.rms_norm_one_tensor_cache.size, src->ne, acl_gamma_nb, dst->type,
|
|
1049
|
+
1, // dims
|
|
1050
|
+
1.0f // value
|
|
973
1051
|
);
|
|
974
1052
|
|
|
975
|
-
// build rstd
|
|
976
|
-
|
|
1053
|
+
// build rstd.
|
|
1054
|
+
int64_t acl_rstd_ne[] = { src->ne[1], src->ne[2], src->ne[3] };
|
|
1055
|
+
size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
|
|
1056
|
+
// rstd will always be F32.
|
|
977
1057
|
acl_rstd_nb[0] = sizeof(float);
|
|
978
|
-
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
979
|
-
acl_rstd_nb[i] = acl_rstd_nb[i - 1] *
|
|
980
|
-
}
|
|
981
|
-
|
|
982
|
-
ctx,
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
acl_rstd_nb,
|
|
987
|
-
GGML_MAX_DIMS,
|
|
988
|
-
0.0f // value
|
|
989
|
-
);
|
|
1058
|
+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
|
1059
|
+
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
|
|
1060
|
+
}
|
|
1061
|
+
acl_tensor_ptr acl_rstd =
|
|
1062
|
+
get_cache_acl_tensor(ctx, &ctx.rms_norm_zero_tensor_cache.cache, ctx.rms_norm_zero_tensor_cache.size,
|
|
1063
|
+
acl_rstd_ne, acl_rstd_nb, GGML_TYPE_F32, GGML_MAX_DIMS - 1,
|
|
1064
|
+
0.0f // value
|
|
1065
|
+
);
|
|
990
1066
|
|
|
991
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src, acl_gamma, eps, acl_dst, acl_rstd);
|
|
992
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, acl_gamma, acl_rstd);
|
|
1067
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, RmsNorm, acl_src.get(), acl_gamma.get(), eps, acl_dst.get(), acl_rstd.get());
|
|
993
1068
|
}
|
|
994
1069
|
|
|
995
1070
|
// TODO: performance is low.
|
|
996
|
-
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
|
997
|
-
|
|
998
|
-
ggml_tensor* src = dst->src[0];
|
|
1071
|
+
void ggml_cann_diag_mask(ggml_backend_cann_context & ctx, ggml_tensor * dst, float value) {
|
|
1072
|
+
ggml_tensor * src = dst->src[0];
|
|
999
1073
|
|
|
1000
|
-
|
|
1001
|
-
|
|
1074
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
1075
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
1002
1076
|
|
|
1003
|
-
const int n_past = ((int32_t*)dst->op_params)[0];
|
|
1077
|
+
const int n_past = ((int32_t *) dst->op_params)[0];
|
|
1004
1078
|
|
|
1005
1079
|
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), ggml_nbytes(src));
|
|
1006
|
-
void*
|
|
1080
|
+
void * buffer = one_tensor_allocator.get();
|
|
1007
1081
|
|
|
1008
|
-
|
|
1009
|
-
|
|
1082
|
+
acl_tensor_ptr mask_tensor = ggml_cann_create_tensor(buffer, ggml_cann_type_mapping(src->type),
|
|
1083
|
+
ggml_type_size(src->type), src->ne, src->nb, GGML_MAX_DIMS);
|
|
1010
1084
|
|
|
1011
|
-
aclnn_fill_scalar(ctx, value, mask_tensor);
|
|
1085
|
+
aclnn_fill_scalar(ctx, value, mask_tensor.get());
|
|
1012
1086
|
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
1087
|
+
float alphaValue = 1.0f;
|
|
1088
|
+
acl_scalar_ptr alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
1016
1089
|
|
|
1017
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor, n_past + 1);
|
|
1018
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src, n_past + 1, acl_dst);
|
|
1019
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst, mask_tensor, alpha);
|
|
1020
|
-
ggml_cann_release_resources(ctx, alpha, acl_src, acl_dst, mask_tensor);
|
|
1090
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceTriu, mask_tensor.get(), n_past + 1);
|
|
1091
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_src.get(), n_past + 1, acl_dst.get());
|
|
1092
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdd, acl_dst.get(), mask_tensor.get(), alpha.get());
|
|
1021
1093
|
}
|
|
1022
1094
|
|
|
1023
1095
|
/**
|
|
@@ -1035,129 +1107,121 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
|
|
|
1035
1107
|
* tensor.
|
|
1036
1108
|
* @param dims The number of dimensions in the tensor.
|
|
1037
1109
|
*/
|
|
1038
|
-
static void aclnn_permute(ggml_backend_cann_context& ctx,
|
|
1039
|
-
aclTensor
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1110
|
+
static void aclnn_permute(ggml_backend_cann_context & ctx,
|
|
1111
|
+
aclTensor * acl_src,
|
|
1112
|
+
aclTensor * acl_dst,
|
|
1113
|
+
int64_t * new_dim,
|
|
1114
|
+
uint64_t dims) {
|
|
1115
|
+
acl_int_array_ptr acl_dims = ggml_cann_create_int_array(new_dim, dims);
|
|
1116
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Permute, acl_src, acl_dims.get(), acl_dst);
|
|
1117
|
+
}
|
|
1118
|
+
|
|
1119
|
+
static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context & ctx,
|
|
1120
|
+
ggml_tensor * dst,
|
|
1121
|
+
ggml_tensor * src1,
|
|
1122
|
+
aclTensor * tmp_cast_tensor,
|
|
1123
|
+
aclTensor * tmp_im2col_tensor) {
|
|
1050
1124
|
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
|
|
1051
|
-
int64_t
|
|
1052
|
-
size_t
|
|
1053
|
-
|
|
1054
|
-
ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
|
|
1125
|
+
int64_t dst_ne[] = { dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3] };
|
|
1126
|
+
size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[3] };
|
|
1127
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
|
|
1055
1128
|
|
|
1056
|
-
int64_t permute_dim[] = {0, 2, 1};
|
|
1129
|
+
int64_t permute_dim[] = { 0, 2, 1 };
|
|
1057
1130
|
if (src1->type != dst->type) {
|
|
1058
|
-
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3);
|
|
1131
|
+
aclnn_permute(ctx, tmp_cast_tensor, acl_dst.get(), permute_dim, 3);
|
|
1059
1132
|
} else {
|
|
1060
|
-
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3);
|
|
1133
|
+
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst.get(), permute_dim, 3);
|
|
1061
1134
|
}
|
|
1062
|
-
|
|
1063
|
-
ggml_cann_release_resources(ctx, acl_dst);
|
|
1064
1135
|
}
|
|
1065
1136
|
|
|
1066
|
-
static void ggml_cann_im2col_1d_post_process(
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1137
|
+
static void ggml_cann_im2col_1d_post_process(ggml_backend_cann_context & ctx,
|
|
1138
|
+
ggml_tensor * dst,
|
|
1139
|
+
ggml_tensor * src1,
|
|
1140
|
+
aclTensor * tmp_cast_tensor,
|
|
1141
|
+
aclTensor * tmp_im2col_tensor,
|
|
1142
|
+
const std::vector<int64_t> & im2col_op_params) {
|
|
1070
1143
|
// get params
|
|
1071
|
-
const int64_t KH
|
|
1072
|
-
const int64_t KW
|
|
1073
|
-
const int64_t IW
|
|
1074
|
-
const int64_t IC
|
|
1075
|
-
const int64_t N
|
|
1076
|
-
const int64_t OH
|
|
1077
|
-
const int64_t OW
|
|
1078
|
-
const int64_t s0
|
|
1079
|
-
const int64_t p0
|
|
1080
|
-
const int64_t d0
|
|
1144
|
+
const int64_t KH = im2col_op_params[0];
|
|
1145
|
+
const int64_t KW = im2col_op_params[1];
|
|
1146
|
+
const int64_t IW = im2col_op_params[2];
|
|
1147
|
+
const int64_t IC = im2col_op_params[3];
|
|
1148
|
+
const int64_t N = im2col_op_params[4];
|
|
1149
|
+
const int64_t OH = im2col_op_params[5];
|
|
1150
|
+
const int64_t OW = im2col_op_params[6];
|
|
1151
|
+
const int64_t s0 = im2col_op_params[7];
|
|
1152
|
+
const int64_t p0 = im2col_op_params[8];
|
|
1153
|
+
const int64_t d0 = im2col_op_params[9];
|
|
1081
1154
|
const int64_t n_bytes_factor = im2col_op_params[10];
|
|
1082
1155
|
|
|
1083
1156
|
// Permute: [N, IC * KH * KW, OW * OH] ->
|
|
1084
1157
|
// [N, OW * OH * n_bytes_factor, IC * KH * KW]
|
|
1085
1158
|
ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool());
|
|
1086
1159
|
tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
|
1087
|
-
void* tmp_permute_buffer = tmp_permute_allocator.get();
|
|
1160
|
+
void * tmp_permute_buffer = tmp_permute_allocator.get();
|
|
1088
1161
|
|
|
1089
|
-
int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N};
|
|
1090
|
-
size_t
|
|
1162
|
+
int64_t tmp_permute_ne[] = { IC * KH * KW, OW * OH * n_bytes_factor, N };
|
|
1163
|
+
size_t tmp_permute_nb[GGML_MAX_DIMS - 1];
|
|
1091
1164
|
tmp_permute_nb[0] = ggml_type_size(dst->type);
|
|
1092
1165
|
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
|
1093
1166
|
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
|
|
1094
1167
|
}
|
|
1095
1168
|
|
|
1096
|
-
|
|
1097
|
-
tmp_permute_buffer, ggml_cann_type_mapping(dst->type),
|
|
1098
|
-
|
|
1099
|
-
GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
|
1169
|
+
acl_tensor_ptr tmp_permute_tensor =
|
|
1170
|
+
ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
|
1171
|
+
tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
|
1100
1172
|
|
|
1101
|
-
int64_t permute_dim[] = {0, 2, 1};
|
|
1173
|
+
int64_t permute_dim[] = { 0, 2, 1 };
|
|
1102
1174
|
if (src1->type != dst->type) {
|
|
1103
|
-
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3);
|
|
1175
|
+
aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor.get(), permute_dim, 3);
|
|
1104
1176
|
} else {
|
|
1105
|
-
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim,
|
|
1106
|
-
3);
|
|
1177
|
+
aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor.get(), permute_dim, 3);
|
|
1107
1178
|
}
|
|
1108
1179
|
|
|
1109
1180
|
// number of times the kernel moves in W dimension
|
|
1110
1181
|
const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1;
|
|
1111
|
-
size_t
|
|
1112
|
-
void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
|
|
1182
|
+
size_t offset;
|
|
1183
|
+
void * cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer;
|
|
1113
1184
|
|
|
1114
1185
|
// memory copy with offset to restore 1D im2col from 2d
|
|
1115
1186
|
if (IC > 1) {
|
|
1116
|
-
offset
|
|
1117
|
-
size_t
|
|
1187
|
+
offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type);
|
|
1188
|
+
size_t cpy_size = KH * KW * ggml_type_size(dst->type);
|
|
1118
1189
|
|
|
1119
1190
|
for (int c = 0; c < IC; c++) {
|
|
1120
|
-
cur_permute_buffer = (char*)tmp_permute_buffer + offset +
|
|
1121
|
-
|
|
1122
|
-
cur_dst_buffer = (char*)dst->data +
|
|
1123
|
-
c * KH * KW * n_step_w * ggml_type_size(dst->type);
|
|
1191
|
+
cur_permute_buffer = (char *) tmp_permute_buffer + offset + KH * KW * c * ggml_type_size(dst->type);
|
|
1192
|
+
cur_dst_buffer = (char *) dst->data + c * KH * KW * n_step_w * ggml_type_size(dst->type);
|
|
1124
1193
|
|
|
1125
1194
|
for (int i = 0; i < n_step_w; i++) {
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
cur_dst_buffer
|
|
1129
|
-
|
|
1130
|
-
cur_permute_buffer = (char*)cur_permute_buffer +
|
|
1131
|
-
KH * KW * IC * ggml_type_size(dst->type);
|
|
1195
|
+
ACL_CHECK(aclrtMemcpyAsync(cur_dst_buffer, cpy_size, cur_permute_buffer, cpy_size,
|
|
1196
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
|
1197
|
+
cur_dst_buffer = (char *) cur_dst_buffer + KH * KW * ggml_type_size(dst->type);
|
|
1198
|
+
cur_permute_buffer = (char *) cur_permute_buffer + KH * KW * IC * ggml_type_size(dst->type);
|
|
1132
1199
|
}
|
|
1133
1200
|
}
|
|
1134
1201
|
} else {
|
|
1135
|
-
offset = KH * KW * n_step_w *
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE);
|
|
1202
|
+
offset = KH * KW * n_step_w * ggml_type_size(dst->type); // equal to ggml_nbytes(dst)
|
|
1203
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, (char *) tmp_permute_buffer + offset, offset,
|
|
1204
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
|
1139
1205
|
}
|
|
1140
|
-
|
|
1141
|
-
ggml_cann_release_resources(ctx, tmp_permute_tensor);
|
|
1142
1206
|
}
|
|
1143
1207
|
|
|
1144
|
-
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
1145
|
-
ggml_tensor* src0 = dst->src[0]; // kernel
|
|
1146
|
-
ggml_tensor* src1 = dst->src[1]; // input
|
|
1208
|
+
void ggml_cann_im2col(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1209
|
+
ggml_tensor * src0 = dst->src[0]; // kernel
|
|
1210
|
+
ggml_tensor * src1 = dst->src[1]; // input
|
|
1147
1211
|
|
|
1148
1212
|
GGML_TENSOR_BINARY_OP_LOCALS;
|
|
1149
1213
|
|
|
1150
1214
|
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
|
|
1151
1215
|
// im2col and do post-processing to restore it to 1D.
|
|
1152
|
-
const bool
|
|
1153
|
-
const int32_t s0
|
|
1154
|
-
const int32_t s1
|
|
1155
|
-
const int32_t p0
|
|
1156
|
-
const int32_t p1
|
|
1157
|
-
const int32_t d0
|
|
1158
|
-
const int32_t d1
|
|
1159
|
-
|
|
1160
|
-
const int64_t N
|
|
1216
|
+
const bool is_2D = ((const int32_t *) (dst->op_params))[6] == 1;
|
|
1217
|
+
const int32_t s0 = ((const int32_t *) (dst->op_params))[0];
|
|
1218
|
+
const int32_t s1 = is_2D ? ((const int32_t *) (dst->op_params))[1] : 1;
|
|
1219
|
+
const int32_t p0 = ((const int32_t *) (dst->op_params))[2];
|
|
1220
|
+
const int32_t p1 = is_2D ? ((const int32_t *) (dst->op_params))[3] : 1;
|
|
1221
|
+
const int32_t d0 = ((const int32_t *) (dst->op_params))[4];
|
|
1222
|
+
const int32_t d1 = is_2D ? ((const int32_t *) (dst->op_params))[5] : 1;
|
|
1223
|
+
|
|
1224
|
+
const int64_t N = ne13;
|
|
1161
1225
|
const int64_t IC = ne12;
|
|
1162
1226
|
const int64_t KH = ne01;
|
|
1163
1227
|
const int64_t KW = ne00;
|
|
@@ -1170,9 +1234,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
1170
1234
|
const int64_t n_bytes_factor = is_2D ? 1 : 3;
|
|
1171
1235
|
|
|
1172
1236
|
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor]
|
|
1173
|
-
|
|
1174
|
-
int64_t
|
|
1175
|
-
size_t
|
|
1237
|
+
acl_tensor_ptr acl_src1 = ggml_cann_create_tensor(src1);
|
|
1238
|
+
int64_t tmp_im2col_ne[] = { OW * OH * n_bytes_factor, IC * KH * KW, N };
|
|
1239
|
+
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
|
|
1176
1240
|
|
|
1177
1241
|
tmp_im2col_nb[0] = ggml_type_size(src1->type);
|
|
1178
1242
|
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
|
|
@@ -1182,31 +1246,28 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
1182
1246
|
// Calculate im2col.
|
|
1183
1247
|
// If dst is f16, tmp_buffer is f32, we need to alloc src.typesize *
|
|
1184
1248
|
// dst.elemcount.
|
|
1185
|
-
ggml_cann_pool_alloc im2col_allocator(
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
std::vector<int64_t>
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
auto* strides = aclCreateIntArray(stride_dims.data(), 2);
|
|
1203
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1, kernel_size, dilations,
|
|
1204
|
-
paddings, strides, tmp_im2col_tensor);
|
|
1249
|
+
ggml_cann_pool_alloc im2col_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
|
|
1250
|
+
void * tmp_im2col_buffer = im2col_allocator.get();
|
|
1251
|
+
|
|
1252
|
+
acl_tensor_ptr tmp_im2col_tensor =
|
|
1253
|
+
ggml_cann_create_tensor(tmp_im2col_buffer, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type),
|
|
1254
|
+
tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
|
1255
|
+
|
|
1256
|
+
std::vector<int64_t> kernel_dims = { KH, KW };
|
|
1257
|
+
std::vector<int64_t> dilation_size = { d1, d0 };
|
|
1258
|
+
std::vector<int64_t> padding_dims = { p1, p0 };
|
|
1259
|
+
std::vector<int64_t> stride_dims = { s1, s0 };
|
|
1260
|
+
acl_int_array_ptr kernel_size = ggml_cann_create_int_array(kernel_dims.data(), 2);
|
|
1261
|
+
acl_int_array_ptr dilations = ggml_cann_create_int_array(dilation_size.data(), 2);
|
|
1262
|
+
acl_int_array_ptr paddings = ggml_cann_create_int_array(padding_dims.data(), 2);
|
|
1263
|
+
acl_int_array_ptr strides = ggml_cann_create_int_array(stride_dims.data(), 2);
|
|
1264
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Im2col, acl_src1.get(), kernel_size.get(), dilations.get(), paddings.get(),
|
|
1265
|
+
strides.get(), tmp_im2col_tensor.get());
|
|
1205
1266
|
|
|
1206
1267
|
// Cast if dst is f16.
|
|
1207
|
-
|
|
1268
|
+
acl_tensor_ptr tmp_cast_tensor;
|
|
1208
1269
|
ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
|
|
1209
|
-
void*
|
|
1270
|
+
void * tmp_cast_buffer = nullptr;
|
|
1210
1271
|
if (src1->type != dst->type) {
|
|
1211
1272
|
tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
|
|
1212
1273
|
tmp_cast_buffer = tmp_cast_allocator.get();
|
|
@@ -1216,26 +1277,20 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
1216
1277
|
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
|
|
1217
1278
|
}
|
|
1218
1279
|
|
|
1219
|
-
tmp_cast_tensor =
|
|
1220
|
-
tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor, ggml_cann_type_mapping(dst->type));
|
|
1280
|
+
tmp_cast_tensor =
|
|
1281
|
+
ggml_cann_create_tensor(tmp_cast_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
|
1282
|
+
tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
|
|
1283
|
+
aclnn_cast(ctx, tmp_im2col_tensor.get(), tmp_cast_tensor.get(), ggml_cann_type_mapping(dst->type));
|
|
1224
1284
|
}
|
|
1225
1285
|
|
|
1226
1286
|
// post-processing
|
|
1227
1287
|
if (is_2D) {
|
|
1228
|
-
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
|
|
1229
|
-
tmp_im2col_tensor);
|
|
1288
|
+
ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get());
|
|
1230
1289
|
} else {
|
|
1231
|
-
std::vector<int64_t> im2col_op_params = {
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
tmp_im2col_tensor, im2col_op_params);
|
|
1290
|
+
std::vector<int64_t> im2col_op_params = { KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor };
|
|
1291
|
+
ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor.get(), tmp_im2col_tensor.get(),
|
|
1292
|
+
im2col_op_params);
|
|
1235
1293
|
}
|
|
1236
|
-
|
|
1237
|
-
ggml_cann_release_resources(ctx, acl_src1, tmp_im2col_tensor, tmp_cast_tensor,
|
|
1238
|
-
kernel_size, dilations, paddings, strides);
|
|
1239
1294
|
}
|
|
1240
1295
|
|
|
1241
1296
|
/**
|
|
@@ -1251,136 +1306,117 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
1251
1306
|
* @param ctx The context for the CANN backend operations.
|
|
1252
1307
|
* @param acl_src The tensor on which the exponential function will be applied.
|
|
1253
1308
|
*/
|
|
1254
|
-
static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) {
|
|
1309
|
+
static void aclnn_exp(ggml_backend_cann_context & ctx, aclTensor * acl_src) {
|
|
1255
1310
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceExp, acl_src);
|
|
1256
1311
|
}
|
|
1257
1312
|
|
|
1258
|
-
void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
1259
|
-
|
|
1260
|
-
if(acl_dst == nullptr) {
|
|
1313
|
+
void aclnn_cos(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1314
|
+
if (acl_dst == nullptr) {
|
|
1261
1315
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCos, acl_src);
|
|
1262
1316
|
} else {
|
|
1263
1317
|
GGML_CANN_CALL_ACLNN_OP(ctx, Cos, acl_src, acl_dst);
|
|
1264
1318
|
}
|
|
1265
1319
|
}
|
|
1266
1320
|
|
|
1267
|
-
void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
1268
|
-
|
|
1269
|
-
if(acl_dst == nullptr) {
|
|
1321
|
+
void aclnn_sin(ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1322
|
+
if (acl_dst == nullptr) {
|
|
1270
1323
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSin, acl_src);
|
|
1271
1324
|
} else {
|
|
1272
1325
|
GGML_CANN_CALL_ACLNN_OP(ctx, Sin, acl_src, acl_dst);
|
|
1273
1326
|
}
|
|
1274
1327
|
}
|
|
1275
1328
|
|
|
1276
|
-
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
|
|
1277
|
-
|
|
1278
|
-
const ggml_tensor* src = dst->src[0];
|
|
1329
|
+
void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1330
|
+
const ggml_tensor * src = dst->src[0];
|
|
1279
1331
|
|
|
1280
1332
|
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
|
1281
1333
|
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
|
1282
1334
|
|
|
1283
|
-
const int dim
|
|
1335
|
+
const int dim = dst->op_params[0];
|
|
1284
1336
|
const int max_period = dst->op_params[1];
|
|
1285
|
-
int
|
|
1337
|
+
int half = dim / 2;
|
|
1286
1338
|
|
|
1287
|
-
|
|
1339
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src);
|
|
1288
1340
|
|
|
1289
1341
|
// arange: [0, ..., half)
|
|
1290
|
-
float
|
|
1291
|
-
float
|
|
1292
|
-
float
|
|
1342
|
+
float start = 0;
|
|
1343
|
+
float stop = half;
|
|
1344
|
+
float step = 1;
|
|
1293
1345
|
int64_t n_elements_arange = half;
|
|
1294
|
-
int64_t tmp_arange_ne[]
|
|
1295
|
-
size_t
|
|
1346
|
+
int64_t tmp_arange_ne[] = { half };
|
|
1347
|
+
size_t tmp_arange_nb[] = { sizeof(dst->type) };
|
|
1296
1348
|
|
|
1297
1349
|
ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type));
|
|
1298
|
-
void*
|
|
1299
|
-
|
|
1300
|
-
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
|
|
1301
|
-
|
|
1302
|
-
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
|
1350
|
+
void * tmp_arange_buffer = arange_allocator.get();
|
|
1351
|
+
acl_tensor_ptr tmp_arange_tensor =
|
|
1352
|
+
ggml_cann_create_tensor(tmp_arange_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
|
1353
|
+
tmp_arange_ne, tmp_arange_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
|
|
1303
1354
|
|
|
1304
|
-
aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange);
|
|
1355
|
+
aclnn_arange(ctx, tmp_arange_tensor.get(), start, stop, step, n_elements_arange);
|
|
1305
1356
|
|
|
1306
1357
|
// freq
|
|
1307
1358
|
float freq_param = -logf(max_period) / half;
|
|
1308
|
-
bool
|
|
1309
|
-
aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace);
|
|
1310
|
-
aclnn_exp(ctx, tmp_arange_tensor);
|
|
1359
|
+
bool inplace = true;
|
|
1360
|
+
aclnn_muls(ctx, tmp_arange_tensor.get(), freq_param, nullptr, inplace);
|
|
1361
|
+
aclnn_exp(ctx, tmp_arange_tensor.get());
|
|
1311
1362
|
|
|
1312
1363
|
// permute: src [0,1,2,3]->[0,1,3,2]
|
|
1313
|
-
int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]};
|
|
1314
|
-
size_t
|
|
1364
|
+
int64_t tmp_permute_ne[] = { src->ne[1], src->ne[0], src->ne[2], src->ne[3] };
|
|
1365
|
+
size_t tmp_permute_nb[GGML_MAX_DIMS];
|
|
1315
1366
|
tmp_permute_nb[0] = ggml_type_size(src->type);
|
|
1316
1367
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1317
1368
|
tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1];
|
|
1318
1369
|
}
|
|
1319
1370
|
|
|
1320
1371
|
ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src));
|
|
1321
|
-
void*
|
|
1322
|
-
|
|
1323
|
-
tmp_permute_buffer, ggml_cann_type_mapping(src->type),
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
int64_t
|
|
1327
|
-
|
|
1328
|
-
aclnn_permute(ctx, acl_src, tmp_permute_tensor, permute_dim, num_dims);
|
|
1372
|
+
void * tmp_permute_buffer = permute_allocator.get();
|
|
1373
|
+
acl_tensor_ptr tmp_permute_tensor =
|
|
1374
|
+
ggml_cann_create_tensor(tmp_permute_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
|
|
1375
|
+
tmp_permute_ne, tmp_permute_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
1376
|
+
int64_t permute_dim[] = { 0, 1, 3, 2 };
|
|
1377
|
+
int64_t num_dims = 4;
|
|
1378
|
+
aclnn_permute(ctx, acl_src.get(), tmp_permute_tensor.get(), permute_dim, num_dims);
|
|
1329
1379
|
|
|
1330
1380
|
// timestep * freq
|
|
1331
|
-
int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2],
|
|
1332
|
-
|
|
1333
|
-
size_t tmp_mul_nb[GGML_MAX_DIMS];
|
|
1381
|
+
int64_t tmp_mul_ne[] = { src->ne[1] * half, src->ne[0], src->ne[2], src->ne[3] };
|
|
1382
|
+
size_t tmp_mul_nb[GGML_MAX_DIMS];
|
|
1334
1383
|
tmp_mul_nb[0] = ggml_type_size(src->type);
|
|
1335
1384
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1336
1385
|
tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1];
|
|
1337
1386
|
}
|
|
1338
1387
|
|
|
1339
|
-
int mul_nelements =
|
|
1340
|
-
src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
|
|
1388
|
+
int mul_nelements = src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3];
|
|
1341
1389
|
|
|
1342
|
-
ggml_cann_pool_alloc mul_allocator(
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
ACL_FORMAT_ND);
|
|
1349
|
-
aclnn_mul(ctx, tmp_permute_tensor, tmp_arange_tensor, tmp_mul_tensor);
|
|
1390
|
+
ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
|
1391
|
+
void * tmp_mul_buffer = mul_allocator.get();
|
|
1392
|
+
acl_tensor_ptr tmp_mul_tensor =
|
|
1393
|
+
ggml_cann_create_tensor(tmp_mul_buffer, ggml_cann_type_mapping(src->type), ggml_type_size(src->type),
|
|
1394
|
+
tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
1395
|
+
aclnn_mul(ctx, tmp_permute_tensor.get(), tmp_arange_tensor.get(), tmp_mul_tensor.get());
|
|
1350
1396
|
|
|
1351
1397
|
// cos
|
|
1352
|
-
ggml_cann_pool_alloc cos_allocator(
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
|
|
1358
|
-
ACL_FORMAT_ND);
|
|
1398
|
+
ggml_cann_pool_alloc cos_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
|
1399
|
+
void * tmp_cos_buffer = cos_allocator.get();
|
|
1400
|
+
acl_tensor_ptr tmp_cos_tensor =
|
|
1401
|
+
ggml_cann_create_tensor(tmp_cos_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
|
1402
|
+
tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
1359
1403
|
|
|
1360
|
-
aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor);
|
|
1404
|
+
aclnn_cos(ctx, tmp_mul_tensor.get(), tmp_cos_tensor.get());
|
|
1361
1405
|
|
|
1362
1406
|
// sin
|
|
1363
|
-
ggml_cann_pool_alloc sin_allocator(
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS,
|
|
1369
|
-
ACL_FORMAT_ND);
|
|
1407
|
+
ggml_cann_pool_alloc sin_allocator(ctx.pool(), mul_nelements * ggml_type_size(src->type));
|
|
1408
|
+
void * tmp_sin_buffer = sin_allocator.get();
|
|
1409
|
+
acl_tensor_ptr tmp_sin_tensor =
|
|
1410
|
+
ggml_cann_create_tensor(tmp_sin_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
|
1411
|
+
tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
1370
1412
|
|
|
1371
|
-
aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor);
|
|
1413
|
+
aclnn_sin(ctx, tmp_mul_tensor.get(), tmp_sin_tensor.get());
|
|
1372
1414
|
|
|
1373
1415
|
// concat
|
|
1374
|
-
int64_t
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
aclnn_concat(ctx, tensor_list, acl_dst, concat_dim);
|
|
1379
|
-
|
|
1380
|
-
// release
|
|
1381
|
-
// segmentation fault when delete both tensorList and his elements.
|
|
1382
|
-
ggml_cann_release_resources(ctx, tensor_list, acl_src, tmp_arange_tensor,
|
|
1383
|
-
tmp_permute_tensor, tmp_mul_tensor, acl_dst);
|
|
1416
|
+
int64_t concat_dim = 3;
|
|
1417
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
1418
|
+
acl_tensor_list_ptr tensor_list = ggml_cann_create_tensor_list(tmp_cos_tensor, tmp_sin_tensor);
|
|
1419
|
+
aclnn_concat(ctx, tensor_list.get(), acl_dst.get(), concat_dim);
|
|
1384
1420
|
}
|
|
1385
1421
|
|
|
1386
1422
|
/**
|
|
@@ -1399,8 +1435,7 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
|
|
|
1399
1435
|
* @param acl_exp The exponent tensor, each element of which is used to raise
|
|
1400
1436
|
* the corresponding element in the destination tensor.
|
|
1401
1437
|
*/
|
|
1402
|
-
static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
|
1403
|
-
aclTensor* acl_dst, aclTensor* acl_exp) {
|
|
1438
|
+
static void aclnn_pow_tensor_tensor(ggml_backend_cann_context & ctx, aclTensor * acl_dst, aclTensor * acl_exp) {
|
|
1404
1439
|
GGML_CANN_CALL_ACLNN_OP(ctx, InplacePowTensorTensor, acl_dst, acl_exp);
|
|
1405
1440
|
}
|
|
1406
1441
|
|
|
@@ -1423,26 +1458,33 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
|
|
|
1423
1458
|
* @param start Starting exponent offset.
|
|
1424
1459
|
* @param stop Stopping exponent offset (exclusive).
|
|
1425
1460
|
* @param step Step size for the exponent increment.
|
|
1461
|
+
* @param dtype Data type for slope tensor.
|
|
1426
1462
|
*/
|
|
1427
|
-
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx,
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1463
|
+
static void aclnn_get_slope_inner(ggml_backend_cann_context & ctx,
|
|
1464
|
+
void * slope_buffer,
|
|
1465
|
+
float m,
|
|
1466
|
+
int64_t size,
|
|
1467
|
+
float start,
|
|
1468
|
+
float stop,
|
|
1469
|
+
float step,
|
|
1470
|
+
ggml_type dtype) {
|
|
1471
|
+
aclDataType acl_type = ggml_cann_type_mapping(dtype);
|
|
1472
|
+
size_t type_size = ggml_type_size(dtype);
|
|
1431
1473
|
|
|
1432
|
-
|
|
1433
|
-
|
|
1474
|
+
int64_t ne[] = { size };
|
|
1475
|
+
size_t nb[] = { type_size };
|
|
1434
1476
|
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
|
|
1477
|
+
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
|
|
1478
|
+
void * arange_buffer = arange_allocator.get();
|
|
1438
1479
|
|
|
1439
|
-
|
|
1440
|
-
|
|
1480
|
+
acl_tensor_ptr arange_tensor = ggml_cann_create_tensor(arange_buffer, acl_type, type_size, ne, nb, 1);
|
|
1481
|
+
aclnn_arange(ctx, arange_tensor.get(), start, stop, step, size);
|
|
1441
1482
|
|
|
1442
|
-
|
|
1483
|
+
acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, acl_type, type_size, ne, nb, 1);
|
|
1443
1484
|
|
|
1444
|
-
|
|
1445
|
-
|
|
1485
|
+
acl_scalar_ptr sc = ggml_cann_create_scalar(&m, aclDataType::ACL_FLOAT);
|
|
1486
|
+
|
|
1487
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc.get(), arange_tensor.get(), slope_tensor.get());
|
|
1446
1488
|
}
|
|
1447
1489
|
|
|
1448
1490
|
/**
|
|
@@ -1468,10 +1510,14 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
|
|
|
1468
1510
|
* @param n_head Total number of attention heads.
|
|
1469
1511
|
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
|
|
1470
1512
|
* @param max_bias Maximum bias value for slope computation.
|
|
1513
|
+
* @param dtype Data type for slope tensor.
|
|
1471
1514
|
*
|
|
1472
1515
|
*/
|
|
1473
|
-
static void aclnn_get_slope(ggml_backend_cann_context & ctx,
|
|
1474
|
-
|
|
1516
|
+
static void aclnn_get_slope(ggml_backend_cann_context & ctx,
|
|
1517
|
+
int64_t n_head,
|
|
1518
|
+
void * slope_buffer,
|
|
1519
|
+
float max_bias,
|
|
1520
|
+
ggml_type dtype) {
|
|
1475
1521
|
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
|
|
1476
1522
|
|
|
1477
1523
|
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
|
|
@@ -1488,16 +1534,15 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
|
|
1488
1534
|
float step = 1;
|
|
1489
1535
|
float count = n_head_log2;
|
|
1490
1536
|
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
|
|
1491
|
-
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
|
|
1537
|
+
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
|
|
1492
1538
|
if (n_head_log2 < n_head) {
|
|
1493
1539
|
// arange2
|
|
1494
1540
|
start = 2 * (n_head_log2 - n_head_log2) + 1;
|
|
1495
1541
|
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
|
1496
1542
|
step = 2;
|
|
1497
1543
|
count = n_head - n_head_log2;
|
|
1498
|
-
aclnn_get_slope_inner(
|
|
1499
|
-
|
|
1500
|
-
m1, count, start, end + 1, step);
|
|
1544
|
+
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
|
|
1545
|
+
dtype);
|
|
1501
1546
|
}
|
|
1502
1547
|
}
|
|
1503
1548
|
|
|
@@ -1522,19 +1567,21 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
|
|
|
1522
1567
|
* - Write data into dst_ptr using only the shape information of the dst tensor.
|
|
1523
1568
|
* - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
|
|
1524
1569
|
*/
|
|
1525
|
-
static void aclnn_add_alibi(ggml_backend_cann_context& ctx,
|
|
1526
|
-
|
|
1527
|
-
|
|
1528
|
-
|
|
1570
|
+
static void aclnn_add_alibi(ggml_backend_cann_context & ctx,
|
|
1571
|
+
ggml_tensor * mask,
|
|
1572
|
+
ggml_tensor * dst,
|
|
1573
|
+
void * dst_ptr,
|
|
1574
|
+
float max_bias) {
|
|
1575
|
+
void * slope_buffer = nullptr;
|
|
1576
|
+
void * bias_buffer = nullptr;
|
|
1529
1577
|
|
|
1530
1578
|
if (max_bias > 0.0f) {
|
|
1531
|
-
int64_t
|
|
1579
|
+
int64_t n_heads = dst->ne[2];
|
|
1532
1580
|
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
|
|
1533
1581
|
slope_buffer = slope_allocator.get();
|
|
1534
|
-
ggml_cann_pool_alloc bias_allocator(
|
|
1535
|
-
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
|
|
1582
|
+
ggml_cann_pool_alloc bias_allocator(ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
|
|
1536
1583
|
bias_buffer = bias_allocator.get();
|
|
1537
|
-
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
|
|
1584
|
+
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
|
|
1538
1585
|
}
|
|
1539
1586
|
|
|
1540
1587
|
// broadcast for mask, slop and dst;
|
|
@@ -1543,16 +1590,12 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
|
|
1543
1590
|
|
|
1544
1591
|
// broadcast the mask across rows
|
|
1545
1592
|
int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
|
|
1546
|
-
size_t mask_nb[] = {
|
|
1547
|
-
|
|
1548
|
-
mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
|
|
1549
|
-
};
|
|
1593
|
+
size_t mask_nb[] = { mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
|
|
1594
|
+
mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3] };
|
|
1550
1595
|
|
|
1551
1596
|
int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
|
|
1552
|
-
size_t dst_nb[] = {
|
|
1553
|
-
|
|
1554
|
-
dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
|
|
1555
|
-
};
|
|
1597
|
+
size_t dst_nb[] = { dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
|
|
1598
|
+
dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3] };
|
|
1556
1599
|
|
|
1557
1600
|
// slope is a 1 dim tensor, slope.ne2 == dst.ne2
|
|
1558
1601
|
int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
|
|
@@ -1562,17 +1605,13 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
|
|
1562
1605
|
slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
|
|
1563
1606
|
}
|
|
1564
1607
|
|
|
1565
|
-
|
|
1566
|
-
|
|
1567
|
-
|
|
1568
|
-
aclTensor* acl_mask = ggml_cann_create_tensor(
|
|
1569
|
-
mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
|
|
1608
|
+
acl_tensor_ptr acl_slope =
|
|
1609
|
+
ggml_cann_create_tensor(slope_buffer, ACL_FLOAT, sizeof(float), slope_ne, slope_nb, GGML_MAX_DIMS + 2);
|
|
1610
|
+
acl_tensor_ptr acl_mask = ggml_cann_create_tensor(mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
|
|
1570
1611
|
|
|
1571
1612
|
// write data into dst_ptr using only the shape information of the dst tensor.
|
|
1572
|
-
|
|
1573
|
-
|
|
1574
|
-
ggml_type_size(dst->type), dst_ne, dst_nb,
|
|
1575
|
-
GGML_MAX_DIMS + 2);
|
|
1613
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst_ptr, ggml_cann_type_mapping(dst->type),
|
|
1614
|
+
ggml_type_size(dst->type), dst_ne, dst_nb, GGML_MAX_DIMS + 2);
|
|
1576
1615
|
|
|
1577
1616
|
if (max_bias > 0.0f) {
|
|
1578
1617
|
int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
|
|
@@ -1581,17 +1620,14 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
|
|
|
1581
1620
|
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
|
|
1582
1621
|
bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
|
|
1583
1622
|
}
|
|
1584
|
-
|
|
1585
|
-
|
|
1586
|
-
bias_ne, bias_nb, GGML_MAX_DIMS + 2);
|
|
1623
|
+
acl_tensor_ptr bias_tensor =
|
|
1624
|
+
ggml_cann_create_tensor(bias_buffer, ACL_FLOAT, sizeof(float), bias_ne, bias_nb, GGML_MAX_DIMS + 2);
|
|
1587
1625
|
|
|
1588
|
-
aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
|
|
1589
|
-
aclnn_add(ctx, acl_dst, bias_tensor);
|
|
1590
|
-
ggml_cann_release_resources(ctx, bias_tensor);
|
|
1626
|
+
aclnn_mul(ctx, acl_slope.get(), acl_mask.get(), bias_tensor.get());
|
|
1627
|
+
aclnn_add(ctx, acl_dst.get(), bias_tensor.get());
|
|
1591
1628
|
} else {
|
|
1592
|
-
aclnn_add(ctx, acl_dst, acl_mask);
|
|
1629
|
+
aclnn_add(ctx, acl_dst.get(), acl_mask.get());
|
|
1593
1630
|
}
|
|
1594
|
-
ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
|
|
1595
1631
|
}
|
|
1596
1632
|
|
|
1597
1633
|
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
@@ -1612,17 +1648,16 @@ void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
|
1612
1648
|
* @param acl_dst The destination tensor where the softmax results will be
|
|
1613
1649
|
* stored.
|
|
1614
1650
|
*/
|
|
1615
|
-
static void aclnn_softmax(ggml_backend_cann_context & ctx,
|
|
1616
|
-
aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
|
|
1651
|
+
static void aclnn_softmax(ggml_backend_cann_context & ctx, aclTensor * acl_src, int64_t dim, aclTensor * acl_dst) {
|
|
1617
1652
|
GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
|
|
1618
1653
|
}
|
|
1619
1654
|
|
|
1620
1655
|
void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1621
|
-
ggml_tensor* src0 = dst->src[0];
|
|
1622
|
-
ggml_tensor* src1 = dst->src[1]; // mask
|
|
1656
|
+
ggml_tensor * src0 = dst->src[0];
|
|
1657
|
+
ggml_tensor * src1 = dst->src[1]; // mask
|
|
1623
1658
|
|
|
1624
|
-
|
|
1625
|
-
|
|
1659
|
+
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
|
1660
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
1626
1661
|
|
|
1627
1662
|
float scale = 1.0f;
|
|
1628
1663
|
float max_bias = 0.0f;
|
|
@@ -1631,22 +1666,20 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
|
1631
1666
|
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
|
|
1632
1667
|
|
|
1633
1668
|
// input mul scale
|
|
1634
|
-
|
|
1669
|
+
acl_scalar_ptr acl_scale = ggml_cann_create_scalar(&scale, aclDataType::ACL_FLOAT);
|
|
1635
1670
|
ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
|
|
1636
|
-
void*
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
|
|
1671
|
+
void * src_tensor_buffer = src_tensor_allocator.get();
|
|
1672
|
+
acl_tensor_ptr softmax_tensor = ggml_cann_create_tensor(src_tensor_buffer, ggml_cann_type_mapping(src0->type),
|
|
1673
|
+
ggml_element_size(src0), src0->ne, src0->nb, GGML_MAX_DIMS);
|
|
1640
1674
|
|
|
1641
|
-
aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
|
|
1675
|
+
aclnn_muls(ctx, acl_src0.get(), scale, softmax_tensor.get(), false);
|
|
1642
1676
|
|
|
1643
1677
|
// mask
|
|
1644
1678
|
if (src1) {
|
|
1645
1679
|
aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
|
|
1646
1680
|
}
|
|
1647
1681
|
// softmax
|
|
1648
|
-
aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
|
|
1649
|
-
ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
|
|
1682
|
+
aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
|
|
1650
1683
|
}
|
|
1651
1684
|
|
|
1652
1685
|
/**
|
|
@@ -1668,31 +1701,32 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
|
1668
1701
|
* @param index The index tensor specifying the indices to select from the source tensor.
|
|
1669
1702
|
* @param type The data type of the source and destination tensors.
|
|
1670
1703
|
*/
|
|
1671
|
-
static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1704
|
+
static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
|
|
1705
|
+
void * src_buffer,
|
|
1706
|
+
int64_t * src_ne,
|
|
1707
|
+
size_t * src_nb,
|
|
1708
|
+
void * dst_buffer,
|
|
1709
|
+
int64_t * dst_ne,
|
|
1710
|
+
size_t * dst_nb,
|
|
1711
|
+
ggml_tensor * index,
|
|
1712
|
+
ggml_type type) {
|
|
1675
1713
|
for (int64_t i = 0; i < src_ne[3]; i++) {
|
|
1676
1714
|
for (int64_t j = 0; j < src_ne[2]; j++) {
|
|
1677
1715
|
// src
|
|
1678
|
-
|
|
1679
|
-
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
|
1680
|
-
|
|
1681
|
-
src_ne, src_nb, 2);
|
|
1716
|
+
acl_tensor_ptr acl_src_tensor =
|
|
1717
|
+
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
|
|
1718
|
+
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
|
|
1682
1719
|
|
|
1683
1720
|
// index
|
|
1684
|
-
|
|
1685
|
-
(char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
|
1686
|
-
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
|
1687
|
-
index->ne, index->nb, 1);
|
|
1721
|
+
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
|
|
1722
|
+
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
|
1723
|
+
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
|
|
1688
1724
|
|
|
1689
1725
|
// out
|
|
1690
|
-
|
|
1691
|
-
(char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
|
1692
|
-
|
|
1693
|
-
|
|
1694
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor, 0, acl_index, acl_out);
|
|
1695
|
-
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
|
|
1726
|
+
acl_tensor_ptr acl_out =
|
|
1727
|
+
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
|
1728
|
+
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
|
|
1729
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
|
|
1696
1730
|
}
|
|
1697
1731
|
}
|
|
1698
1732
|
}
|
|
@@ -1717,167 +1751,149 @@ static void aclnn_index_select_4d(ggml_backend_cann_context& ctx,
|
|
|
1717
1751
|
* @param index The index tensor specifying target positions in the destination tensor.
|
|
1718
1752
|
* @param type The data type of the source and destination tensors.
|
|
1719
1753
|
*/
|
|
1720
|
-
static void aclnn_index_copy_4d(ggml_backend_cann_context& ctx,
|
|
1721
|
-
void*
|
|
1722
|
-
|
|
1723
|
-
|
|
1754
|
+
static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
|
|
1755
|
+
void * src_buffer,
|
|
1756
|
+
int64_t * src_ne,
|
|
1757
|
+
size_t * src_nb,
|
|
1758
|
+
void * dst_buffer,
|
|
1759
|
+
int64_t * dst_ne,
|
|
1760
|
+
size_t * dst_nb,
|
|
1761
|
+
ggml_tensor * index,
|
|
1762
|
+
ggml_type type) {
|
|
1724
1763
|
for (int64_t i = 0; i < src_ne[3]; i++) {
|
|
1725
1764
|
for (int64_t j = 0; j < src_ne[2]; j++) {
|
|
1726
1765
|
// src
|
|
1727
|
-
|
|
1728
|
-
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
|
1729
|
-
|
|
1730
|
-
src_ne, src_nb, 2);
|
|
1766
|
+
acl_tensor_ptr acl_src_tensor =
|
|
1767
|
+
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
|
|
1768
|
+
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
|
|
1731
1769
|
|
|
1732
1770
|
// index
|
|
1733
|
-
|
|
1734
|
-
(char*)index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
|
1735
|
-
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
|
1736
|
-
index->ne, index->nb, 1);
|
|
1771
|
+
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
|
|
1772
|
+
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
|
1773
|
+
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
|
|
1737
1774
|
|
|
1738
1775
|
// out
|
|
1739
|
-
|
|
1740
|
-
(char*)dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out, 0, acl_index, acl_src_tensor);
|
|
1744
|
-
ggml_cann_release_resources(ctx, acl_src_tensor, acl_index, acl_out);
|
|
1776
|
+
acl_tensor_ptr acl_out =
|
|
1777
|
+
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
|
1778
|
+
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
|
|
1779
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
|
|
1745
1780
|
}
|
|
1746
1781
|
}
|
|
1747
1782
|
}
|
|
1748
1783
|
|
|
1749
|
-
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
1750
|
-
ggml_tensor* src0 = dst->src[0]; // src
|
|
1751
|
-
ggml_tensor* src1 = dst->src[1]; // index
|
|
1784
|
+
void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1785
|
+
ggml_tensor * src0 = dst->src[0]; // src
|
|
1786
|
+
ggml_tensor * src1 = dst->src[1]; // index
|
|
1787
|
+
|
|
1788
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
1752
1789
|
|
|
1753
1790
|
switch (src0->type) {
|
|
1754
|
-
case
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1791
|
+
case GGML_TYPE_F16:
|
|
1792
|
+
case GGML_TYPE_F32:
|
|
1793
|
+
if (src0->type == dst->type) {
|
|
1794
|
+
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
|
|
1795
|
+
dst->type);
|
|
1796
|
+
} else {
|
|
1797
|
+
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
|
1798
|
+
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
|
|
1799
|
+
void * src_trans_buffer = src_buffer_allocator.get();
|
|
1800
|
+
size_t src_trans_nb[GGML_MAX_DIMS];
|
|
1801
|
+
src_trans_nb[0] = dst->nb[0];
|
|
1802
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1803
|
+
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
|
1804
|
+
}
|
|
1805
|
+
acl_tensor_ptr src_trans_tensor =
|
|
1806
|
+
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
|
1807
|
+
ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
1808
|
+
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
|
1809
|
+
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
|
1810
|
+
dst->type);
|
|
1769
1811
|
}
|
|
1770
|
-
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
|
1771
|
-
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
|
1772
|
-
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
1773
|
-
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
|
1774
|
-
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
|
1775
|
-
dst->data, dst->ne, dst->nb,
|
|
1776
|
-
src1, dst->type);
|
|
1777
|
-
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
|
1778
1812
|
break;
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
1783
|
-
|
|
1784
|
-
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
|
|
1789
|
-
|
|
1790
|
-
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
|
|
1794
|
-
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
|
|
1808
|
-
|
|
1809
|
-
|
|
1810
|
-
|
|
1811
|
-
|
|
1812
|
-
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
|
|
1819
|
-
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
|
|
1825
|
-
|
|
1826
|
-
dequant_buffer_allocator.get(),
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
|
|
1830
|
-
dequant_nb[0] = sizeof(float_t);
|
|
1831
|
-
dequant_ne = src0->ne;
|
|
1832
|
-
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1833
|
-
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
|
1813
|
+
case GGML_TYPE_Q8_0:
|
|
1814
|
+
{
|
|
1815
|
+
// add 1 dim for bcast mul.
|
|
1816
|
+
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
|
|
1817
|
+
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
|
|
1818
|
+
int64_t scale_offset = 0;
|
|
1819
|
+
// [3,4,5,64] -> [3,4,5,2,32]
|
|
1820
|
+
weight_ne[0] = QK8_0;
|
|
1821
|
+
weight_ne[1] = src0->ne[0] / QK8_0;
|
|
1822
|
+
weight_nb[0] = sizeof(int8_t);
|
|
1823
|
+
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
|
1824
|
+
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
|
1825
|
+
weight_ne[i] = src0->ne[i - 1];
|
|
1826
|
+
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
|
|
1827
|
+
}
|
|
1828
|
+
// [3,4,5,64] -> [3,4,5,2,1]
|
|
1829
|
+
scale_ne[0] = 1;
|
|
1830
|
+
scale_ne[1] = src0->ne[0] / QK8_0;
|
|
1831
|
+
scale_nb[0] = sizeof(uint16_t);
|
|
1832
|
+
scale_nb[1] = scale_nb[0] * scale_ne[0];
|
|
1833
|
+
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
|
1834
|
+
scale_ne[i] = src0->ne[i - 1];
|
|
1835
|
+
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
|
|
1836
|
+
}
|
|
1837
|
+
// [3,4,5,64] -> [3,4,5,2,32]
|
|
1838
|
+
dequant_ne = weight_ne;
|
|
1839
|
+
dequant_nb[0] = ggml_type_size(dst->type);
|
|
1840
|
+
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
|
1841
|
+
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
|
1842
|
+
}
|
|
1843
|
+
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
|
1844
|
+
ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
|
|
1845
|
+
ggml_nelements(src0) * ggml_type_size(dst->type));
|
|
1846
|
+
acl_tensor_ptr acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
|
|
1847
|
+
weight_ne, weight_nb, GGML_MAX_DIMS + 1);
|
|
1848
|
+
acl_tensor_ptr acl_scale_tensor =
|
|
1849
|
+
ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
|
|
1850
|
+
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
|
1851
|
+
acl_tensor_ptr dequant_tensor =
|
|
1852
|
+
ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
|
|
1853
|
+
ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
|
1854
|
+
aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
|
|
1855
|
+
dequant_nb[0] = ggml_type_size(dst->type);
|
|
1856
|
+
dequant_ne = src0->ne;
|
|
1857
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1858
|
+
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
|
1859
|
+
}
|
|
1860
|
+
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
|
|
1861
|
+
dst->nb, src1, dst->type);
|
|
1862
|
+
break;
|
|
1834
1863
|
}
|
|
1835
|
-
|
|
1836
|
-
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(),
|
|
1837
|
-
dequant_ne, dequant_nb,
|
|
1838
|
-
dst->data, dst->ne, dst->nb,
|
|
1839
|
-
src1, dst->type);
|
|
1840
|
-
|
|
1841
|
-
ggml_cann_release_resources(ctx, dequant_tensor);
|
|
1842
|
-
break;
|
|
1843
|
-
}
|
|
1844
1864
|
default:
|
|
1845
1865
|
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
|
|
1846
1866
|
break;
|
|
1847
1867
|
}
|
|
1848
1868
|
}
|
|
1849
1869
|
|
|
1850
|
-
void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
1851
|
-
ggml_tensor* src0 = dst->src[0]; // src
|
|
1852
|
-
ggml_tensor* src1 = dst->src[1]; // index
|
|
1870
|
+
void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1871
|
+
ggml_tensor * src0 = dst->src[0]; // src
|
|
1872
|
+
ggml_tensor * src1 = dst->src[1]; // index
|
|
1853
1873
|
|
|
1854
1874
|
switch (dst->type) {
|
|
1855
|
-
case GGML_TYPE_F32:
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1875
|
+
case GGML_TYPE_F32:
|
|
1876
|
+
{
|
|
1877
|
+
aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
|
|
1878
|
+
break;
|
|
1879
|
+
}
|
|
1880
|
+
case GGML_TYPE_F16:
|
|
1881
|
+
{
|
|
1882
|
+
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
|
1883
|
+
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
|
1884
|
+
void * src_trans_buffer = src_buffer_allocator.get();
|
|
1885
|
+
size_t src_trans_nb[GGML_MAX_DIMS];
|
|
1886
|
+
src_trans_nb[0] = sizeof(uint16_t);
|
|
1887
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
1888
|
+
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
|
1889
|
+
}
|
|
1890
|
+
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
|
|
1891
|
+
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
1892
|
+
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
|
1893
|
+
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
|
1894
|
+
dst->type);
|
|
1895
|
+
break;
|
|
1870
1896
|
}
|
|
1871
|
-
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
|
1872
|
-
src_trans_buffer, ACL_FLOAT16, ggml_type_size(dst->type),
|
|
1873
|
-
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
1874
|
-
aclnn_cast(ctx, acl_src0, src_trans_tensor, ggml_cann_type_mapping(dst->type));
|
|
1875
|
-
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb,
|
|
1876
|
-
dst->data, dst->ne, dst->nb,
|
|
1877
|
-
src1, dst->type);
|
|
1878
|
-
ggml_cann_release_resources(ctx, acl_src0, src_trans_tensor);
|
|
1879
|
-
break;
|
|
1880
|
-
}
|
|
1881
1897
|
default:
|
|
1882
1898
|
GGML_ABORT("Unsupported tensor type for GGML_OP_SET_ROWS");
|
|
1883
1899
|
break;
|
|
@@ -1899,12 +1915,13 @@ void ggml_cann_set_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
1899
1915
|
* @param repeats The number of times each element will be repeated.
|
|
1900
1916
|
* @param output_size The size of the output tensor.
|
|
1901
1917
|
*/
|
|
1902
|
-
static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
|
|
1903
|
-
aclTensor*
|
|
1904
|
-
|
|
1905
|
-
int64_t
|
|
1906
|
-
|
|
1907
|
-
|
|
1918
|
+
static void aclnn_repeat_interleave(ggml_backend_cann_context & ctx,
|
|
1919
|
+
aclTensor * acl_src,
|
|
1920
|
+
aclTensor * acl_dst,
|
|
1921
|
+
int64_t dim,
|
|
1922
|
+
int64_t repeats,
|
|
1923
|
+
int64_t output_size) {
|
|
1924
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, RepeatInterleaveIntWithDim, acl_src, repeats, dim, output_size, acl_dst);
|
|
1908
1925
|
}
|
|
1909
1926
|
|
|
1910
1927
|
/**
|
|
@@ -1919,10 +1936,9 @@ static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx,
|
|
|
1919
1936
|
* @param dst The destination tensor where the result of the matrix
|
|
1920
1937
|
* multiplication will be stored.
|
|
1921
1938
|
*/
|
|
1922
|
-
static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|
1923
|
-
|
|
1924
|
-
ggml_tensor*
|
|
1925
|
-
ggml_tensor* input = dst->src[1]; // input
|
|
1939
|
+
static void ggml_cann_mat_mul_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
1940
|
+
ggml_tensor * weight = dst->src[0]; // weight
|
|
1941
|
+
ggml_tensor * input = dst->src[1]; // input
|
|
1926
1942
|
|
|
1927
1943
|
// when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto
|
|
1928
1944
|
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
|
|
@@ -1937,51 +1953,36 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|
|
1937
1953
|
}
|
|
1938
1954
|
}
|
|
1939
1955
|
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
bcast_weight_nb[2], bcast_weight_nb[3],
|
|
1947
|
-
bcast_weight_nb[4], bcast_weight_nb[5]};
|
|
1948
|
-
aclTensor* acl_weight_tensor;
|
|
1956
|
+
acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
|
|
1957
|
+
int64_t transpose_ne[] = { bcast_weight_ne[1], bcast_weight_ne[0], bcast_weight_ne[2],
|
|
1958
|
+
bcast_weight_ne[3], bcast_weight_ne[4], bcast_weight_ne[5] };
|
|
1959
|
+
size_t transpose_nb[] = { bcast_weight_nb[1], bcast_weight_nb[0], bcast_weight_nb[2],
|
|
1960
|
+
bcast_weight_nb[3], bcast_weight_nb[4], bcast_weight_nb[5] };
|
|
1961
|
+
acl_tensor_ptr acl_weight_tensor;
|
|
1949
1962
|
|
|
1950
1963
|
// Only check env once.
|
|
1951
|
-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
|
1964
|
+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
|
1952
1965
|
if (weight_to_nz && is_matmul_weight(weight)) {
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
// Reverse ne.
|
|
1956
|
-
std::reverse(transpose_ne, transpose_ne + n_dims);
|
|
1957
|
-
|
|
1958
|
-
std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
|
|
1959
|
-
|
|
1960
|
-
acl_weight_tensor = aclCreateTensor(
|
|
1961
|
-
transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
|
|
1962
|
-
0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
|
|
1966
|
+
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_FRACTAL_NZ);
|
|
1963
1967
|
} else {
|
|
1964
|
-
acl_weight_tensor =
|
|
1965
|
-
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
|
1968
|
+
acl_weight_tensor = ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
|
|
1966
1969
|
}
|
|
1967
|
-
|
|
1968
|
-
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
|
1970
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
|
|
1969
1971
|
|
|
1970
1972
|
switch (n_dims) {
|
|
1971
1973
|
case 2:
|
|
1972
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor, acl_weight_tensor, acl_dst, 2);
|
|
1974
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Mm, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 2);
|
|
1973
1975
|
break;
|
|
1974
1976
|
case 3:
|
|
1975
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor, acl_weight_tensor, acl_dst,
|
|
1977
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(),
|
|
1978
|
+
2);
|
|
1976
1979
|
break;
|
|
1977
1980
|
default:
|
|
1978
1981
|
// ALLOW_FP32_DOWN_PRECISION, when input is
|
|
1979
1982
|
// fp32, atlas a2 will transpose it to HFLOAT32.
|
|
1980
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor, acl_weight_tensor, acl_dst, 1);
|
|
1983
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_input_tensor.get(), acl_weight_tensor.get(), acl_dst.get(), 1);
|
|
1981
1984
|
break;
|
|
1982
1985
|
}
|
|
1983
|
-
|
|
1984
|
-
ggml_cann_release_resources(ctx, acl_weight_tensor, acl_input_tensor, acl_dst);
|
|
1985
1986
|
}
|
|
1986
1987
|
|
|
1987
1988
|
/**
|
|
@@ -1997,11 +1998,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
|
|
|
1997
1998
|
* @param dst The destination tensor where the result of the matrix
|
|
1998
1999
|
* multiplication will be stored.
|
|
1999
2000
|
*/
|
|
2000
|
-
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|
2001
|
-
|
|
2002
|
-
|
|
2003
|
-
ggml_tensor* src0 = dst->src[0]; // weight
|
|
2004
|
-
ggml_tensor* src1 = dst->src[1]; // input
|
|
2001
|
+
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst, const enum ggml_type type) {
|
|
2002
|
+
ggml_tensor * src0 = dst->src[0]; // weight
|
|
2003
|
+
ggml_tensor * src1 = dst->src[1]; // input
|
|
2005
2004
|
|
|
2006
2005
|
// The shape of the weight is NCHW.
|
|
2007
2006
|
// Matrix multiplication uses HW dims.
|
|
@@ -2015,56 +2014,51 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|
|
2015
2014
|
} else {
|
|
2016
2015
|
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
|
|
2017
2016
|
}
|
|
2018
|
-
float
|
|
2017
|
+
float weight_nb[] = { src0->ne[0] * weight_elem_size, weight_elem_size };
|
|
2019
2018
|
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
|
|
2020
|
-
size_t weight_size
|
|
2019
|
+
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
|
|
2021
2020
|
|
|
2022
2021
|
// scale stored at the end of weight. Also need transpose.
|
|
2023
2022
|
size_t scale_elem_size = sizeof(uint16_t);
|
|
2024
|
-
size_t scale_nb[]
|
|
2025
|
-
|
|
2026
|
-
|
|
2027
|
-
char* scale_offset = (char*)src0->data + weight_size;
|
|
2023
|
+
size_t scale_nb[] = { src0->ne[0] / QK8_0 * scale_elem_size, scale_elem_size };
|
|
2024
|
+
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
|
2025
|
+
char * scale_offset = (char *) src0->data + weight_size;
|
|
2028
2026
|
|
|
2029
2027
|
// input
|
|
2030
|
-
size_t
|
|
2031
|
-
int64_t
|
|
2032
|
-
size_t
|
|
2033
|
-
size_t
|
|
2028
|
+
size_t input_elem_size = sizeof(uint16_t);
|
|
2029
|
+
int64_t input_ne[] = { src1->ne[0], src1->ne[1] };
|
|
2030
|
+
size_t input_nb[] = { input_elem_size, input_ne[0] * input_elem_size };
|
|
2031
|
+
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
|
|
2034
2032
|
ggml_cann_pool_alloc input_alloctor(ctx.pool());
|
|
2035
|
-
void*
|
|
2033
|
+
void * input_buffer = src1->data;
|
|
2036
2034
|
|
|
2037
2035
|
// case in
|
|
2038
2036
|
if (src1->type != GGML_TYPE_F16) {
|
|
2039
|
-
|
|
2040
|
-
input_buffer
|
|
2041
|
-
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
|
2037
|
+
acl_tensor_ptr acl_src1_tensor = ggml_cann_create_tensor(src1);
|
|
2038
|
+
input_buffer = input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
|
|
2042
2039
|
|
|
2043
|
-
int64_t* input_cast_ne = src1->ne;
|
|
2044
|
-
size_t
|
|
2040
|
+
int64_t * input_cast_ne = src1->ne;
|
|
2041
|
+
size_t input_cast_nb[GGML_MAX_DIMS];
|
|
2045
2042
|
input_cast_nb[0] = sizeof(uint16_t);
|
|
2046
2043
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2047
2044
|
input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1];
|
|
2048
2045
|
}
|
|
2049
2046
|
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
|
|
2054
|
-
ggml_cann_release_resources(ctx, acl_input_tensor, acl_src1_tensor);
|
|
2047
|
+
acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(input_buffer, ACL_FLOAT16, input_elem_size,
|
|
2048
|
+
input_cast_ne, input_cast_nb, GGML_MAX_DIMS);
|
|
2049
|
+
aclnn_cast(ctx, acl_src1_tensor.get(), acl_input_tensor.get(), ACL_FLOAT16);
|
|
2055
2050
|
}
|
|
2056
2051
|
|
|
2057
2052
|
// output
|
|
2058
|
-
size_t
|
|
2059
|
-
size_t
|
|
2053
|
+
size_t output_elem_size = sizeof(uint16_t);
|
|
2054
|
+
size_t output_nb[] = { output_elem_size, dst->ne[0] * output_elem_size };
|
|
2060
2055
|
ggml_cann_pool_alloc output_allocator(ctx.pool());
|
|
2061
|
-
void*
|
|
2062
|
-
|
|
2063
|
-
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
|
2056
|
+
void * output_buffer = output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
|
|
2057
|
+
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
|
|
2064
2058
|
|
|
2065
2059
|
// aclnn
|
|
2066
|
-
int64_t
|
|
2067
|
-
int64_t
|
|
2060
|
+
int64_t max_elem_size = 65535;
|
|
2061
|
+
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
|
|
2068
2062
|
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
|
|
2069
2063
|
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
|
|
2070
2064
|
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
|
|
@@ -2074,98 +2068,77 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
|
|
2074
2068
|
int64_t batch1 = (n1 * src1->ne[2]) + c1;
|
|
2075
2069
|
int64_t batch0 = (n0 * src0->ne[2]) + c0;
|
|
2076
2070
|
|
|
2077
|
-
|
|
2078
|
-
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
|
|
2079
|
-
input_elem_size, input_ne, input_nb, 2);
|
|
2071
|
+
acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(
|
|
2072
|
+
(char *) input_buffer + batch1 * input_stride, ACL_FLOAT16, input_elem_size, input_ne, input_nb, 2);
|
|
2080
2073
|
|
|
2081
2074
|
// first split
|
|
2082
2075
|
int64_t weight_ne_offset = 0;
|
|
2083
|
-
int64_t weight_ne[2]
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
int64_t scale_ne_offset = 0;
|
|
2087
|
-
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
|
|
2076
|
+
int64_t weight_ne[2] = { max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size, src0->ne[0] };
|
|
2077
|
+
int64_t scale_ne_offset = 0;
|
|
2078
|
+
int64_t scale_ne[2] = { weight_ne[0], weight_ne[1] / QK8_0 };
|
|
2088
2079
|
int64_t output_ne_offset = 0;
|
|
2089
|
-
int64_t output_ne[2]
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
(char*)src0->data + batch0 * weight_stride,
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
|
2101
|
-
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
|
2102
|
-
output_ne_offset);
|
|
2080
|
+
int64_t output_ne[2] = { weight_ne[0], dst->ne[1] };
|
|
2081
|
+
|
|
2082
|
+
acl_tensor_ptr acl_weight_tensor =
|
|
2083
|
+
ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
|
|
2084
|
+
weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
|
|
2085
|
+
acl_tensor_ptr acl_scale_tensor =
|
|
2086
|
+
ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size, scale_ne,
|
|
2087
|
+
scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
|
|
2088
|
+
acl_tensor_ptr acl_output_tensor =
|
|
2089
|
+
ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16, output_elem_size,
|
|
2090
|
+
output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
|
|
2103
2091
|
int64_t antiquantGroupSize = 0;
|
|
2104
2092
|
if (src0->ne[0] > QK8_0) {
|
|
2105
2093
|
antiquantGroupSize = QK8_0;
|
|
2106
2094
|
}
|
|
2107
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
|
|
2108
|
-
|
|
2109
|
-
|
|
2110
|
-
acl_output_tensor);
|
|
2111
|
-
ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
|
|
2095
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
|
|
2096
|
+
acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
|
|
2097
|
+
acl_output_tensor.get());
|
|
2112
2098
|
|
|
2113
2099
|
// other splits
|
|
2114
2100
|
for (int64_t split = 1; split < split_size; split++) {
|
|
2115
|
-
weight_ne_offset +=
|
|
2116
|
-
|
|
2117
|
-
|
|
2118
|
-
? src0->ne[1] - (max_elem_size * split)
|
|
2119
|
-
: max_elem_size;
|
|
2101
|
+
weight_ne_offset += weight_elem_size * weight_ne[0] * weight_ne[1];
|
|
2102
|
+
weight_ne[0] =
|
|
2103
|
+
max_elem_size * (split + 1) > src0->ne[1] ? src0->ne[1] - (max_elem_size * split) : max_elem_size;
|
|
2120
2104
|
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
|
|
2121
2105
|
scale_ne[0] = weight_ne[0];
|
|
2122
|
-
output_ne_offset +=
|
|
2123
|
-
output_elem_size * output_ne[0] * output_ne[1];
|
|
2106
|
+
output_ne_offset += output_elem_size * output_ne[0] * output_ne[1];
|
|
2124
2107
|
output_ne[0] = weight_ne[0];
|
|
2125
2108
|
|
|
2126
|
-
acl_weight_tensor =
|
|
2127
|
-
(char*)src0->data + batch0 * weight_stride,
|
|
2128
|
-
|
|
2129
|
-
|
|
2130
|
-
|
|
2131
|
-
|
|
2132
|
-
|
|
2133
|
-
|
|
2134
|
-
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2138
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor,
|
|
2139
|
-
acl_weight_tensor, acl_scale_tensor, nullptr,
|
|
2140
|
-
nullptr, nullptr, nullptr, antiquantGroupSize,
|
|
2141
|
-
acl_output_tensor);
|
|
2142
|
-
ggml_cann_release_resources(ctx, acl_weight_tensor, acl_scale_tensor, acl_output_tensor);
|
|
2109
|
+
acl_weight_tensor =
|
|
2110
|
+
ggml_cann_create_tensor((char *) src0->data + batch0 * weight_stride, ggml_cann_type_mapping(type),
|
|
2111
|
+
weight_elem_size, weight_ne, weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
|
|
2112
|
+
acl_scale_tensor =
|
|
2113
|
+
ggml_cann_create_tensor(scale_offset + batch0 * scale_stride, ACL_FLOAT16, scale_elem_size,
|
|
2114
|
+
scale_ne, scale_nb, 2, ACL_FORMAT_ND, scale_ne_offset);
|
|
2115
|
+
acl_output_tensor =
|
|
2116
|
+
ggml_cann_create_tensor((char *) output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
|
2117
|
+
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, output_ne_offset);
|
|
2118
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, WeightQuantBatchMatmulV2, acl_input_tensor.get(), acl_weight_tensor.get(),
|
|
2119
|
+
acl_scale_tensor.get(), nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
|
|
2120
|
+
acl_output_tensor.get());
|
|
2143
2121
|
}
|
|
2144
|
-
|
|
2145
|
-
ggml_cann_release_resources(ctx, acl_input_tensor);
|
|
2146
2122
|
}
|
|
2147
2123
|
}
|
|
2148
2124
|
|
|
2149
2125
|
// cast out
|
|
2150
2126
|
if (dst->type != GGML_TYPE_F16) {
|
|
2151
|
-
int64_t* output_cast_ne = dst->ne;
|
|
2152
|
-
size_t
|
|
2127
|
+
int64_t * output_cast_ne = dst->ne;
|
|
2128
|
+
size_t output_cast_nb[GGML_MAX_DIMS];
|
|
2153
2129
|
output_cast_nb[0] = sizeof(uint16_t);
|
|
2154
2130
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2155
2131
|
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
|
|
2156
2132
|
}
|
|
2157
2133
|
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ggml_cann_type_mapping(dst->type));
|
|
2163
|
-
|
|
2164
|
-
ggml_cann_release_resources(ctx, acl_output_tensor, acl_dst_tensor);
|
|
2134
|
+
acl_tensor_ptr acl_output_tensor = ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
|
|
2135
|
+
output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
|
|
2136
|
+
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
|
2137
|
+
aclnn_cast(ctx, acl_output_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
|
2165
2138
|
}
|
|
2166
2139
|
}
|
|
2167
2140
|
|
|
2168
|
-
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
2141
|
+
void ggml_cann_mul_mat(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2169
2142
|
const enum ggml_type type = dst->src[0]->type;
|
|
2170
2143
|
switch (type) {
|
|
2171
2144
|
case GGML_TYPE_F32:
|
|
@@ -2198,12 +2171,14 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
2198
2171
|
* @param dims An array specifying the dimensions along which elements are
|
|
2199
2172
|
* shifted.
|
|
2200
2173
|
*/
|
|
2201
|
-
static void aclnn_roll(ggml_backend_cann_context& ctx,
|
|
2202
|
-
aclTensor
|
|
2203
|
-
|
|
2204
|
-
|
|
2205
|
-
|
|
2206
|
-
|
|
2174
|
+
static void aclnn_roll(ggml_backend_cann_context & ctx,
|
|
2175
|
+
aclTensor * acl_src,
|
|
2176
|
+
aclTensor * acl_dst,
|
|
2177
|
+
int64_t * shifts,
|
|
2178
|
+
int64_t * dims) {
|
|
2179
|
+
acl_int_array_ptr acl_shifts = ggml_cann_create_int_array(shifts, 1);
|
|
2180
|
+
acl_int_array_ptr acl_dims = ggml_cann_create_int_array(dims, 1);
|
|
2181
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Roll, acl_src, acl_shifts.get(), acl_dims.get(), acl_dst);
|
|
2207
2182
|
}
|
|
2208
2183
|
|
|
2209
2184
|
/**
|
|
@@ -2219,14 +2194,15 @@ static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
|
|
2219
2194
|
* @param index_num The number of positions specified in the index array.
|
|
2220
2195
|
* @param value The scalar value used to fill the specified positions.
|
|
2221
2196
|
*/
|
|
2222
|
-
static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
|
|
2223
|
-
aclTensor*
|
|
2224
|
-
int64_t
|
|
2225
|
-
|
|
2226
|
-
|
|
2227
|
-
|
|
2228
|
-
|
|
2229
|
-
|
|
2197
|
+
static void aclnn_index_fill_tensor(ggml_backend_cann_context & ctx,
|
|
2198
|
+
aclTensor * acl_src,
|
|
2199
|
+
int64_t dim,
|
|
2200
|
+
int64_t * index,
|
|
2201
|
+
int64_t index_num,
|
|
2202
|
+
float value) {
|
|
2203
|
+
acl_int_array_ptr acl_index = ggml_cann_create_int_array(index, index_num);
|
|
2204
|
+
acl_scalar_ptr acl_value = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
|
|
2205
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexFillTensor, acl_src, dim, acl_index.get(), acl_value.get());
|
|
2230
2206
|
}
|
|
2231
2207
|
|
|
2232
2208
|
/**
|
|
@@ -2248,420 +2224,435 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
|
|
|
2248
2224
|
* 5. Compute sin(θ), cos(θ) and optionally scale by attn_factor.
|
|
2249
2225
|
* 6. Expand sin/cos values by repeat or repeat_interleave depending
|
|
2250
2226
|
* on whether @param is_neox is enabled.
|
|
2251
|
-
*
|
|
2252
|
-
*
|
|
2253
|
-
*
|
|
2254
|
-
* @param
|
|
2255
|
-
*
|
|
2256
|
-
* @param
|
|
2257
|
-
*
|
|
2258
|
-
* @param
|
|
2259
|
-
* @param
|
|
2260
|
-
*
|
|
2261
|
-
* @param is_neox Whether to use Neox-style repeat strategy
|
|
2262
|
-
* (dim expansion vs repeat_interleave).
|
|
2227
|
+
*
|
|
2228
|
+
* @param ctx The CANN backend context, holding memory pool,
|
|
2229
|
+
* stream, and persistent buffers for rope init/cache.
|
|
2230
|
+
* @param dst The destination ggml_tensor whose computation
|
|
2231
|
+
* depends on the RoPE values (usually Qcur/Kcur).
|
|
2232
|
+
* @param theta_scale Scalar exponent base for computing theta scale values.
|
|
2233
|
+
* @param freq_scale Frequency scaling factor, applied to theta scale.
|
|
2234
|
+
* @param attn_factor Attention scaling factor, applied to sin/cos.
|
|
2235
|
+
* @param is_neox Whether to use Neox-style repeat strategy
|
|
2236
|
+
* (dim expansion vs repeat_interleave).
|
|
2263
2237
|
*/
|
|
2264
|
-
static void aclnn_cache_init(ggml_backend_cann_context& ctx,
|
|
2265
|
-
|
|
2266
|
-
float
|
|
2267
|
-
|
|
2268
|
-
|
|
2269
|
-
|
|
2270
|
-
|
|
2271
|
-
|
|
2272
|
-
|
|
2273
|
-
|
|
2274
|
-
|
|
2275
|
-
|
|
2276
|
-
|
|
2277
|
-
|
|
2238
|
+
static void aclnn_cache_init(ggml_backend_cann_context & ctx,
|
|
2239
|
+
ggml_tensor * dst,
|
|
2240
|
+
float * corr_dims,
|
|
2241
|
+
float ext_factor,
|
|
2242
|
+
float theta_scale,
|
|
2243
|
+
float freq_scale,
|
|
2244
|
+
float attn_factor,
|
|
2245
|
+
bool is_neox) {
|
|
2246
|
+
ggml_tensor * src0 = dst->src[0]; // input
|
|
2247
|
+
ggml_tensor * src1 = dst->src[1]; // position
|
|
2248
|
+
ggml_tensor * src2 = dst->src[2]; // freq_factors
|
|
2249
|
+
|
|
2250
|
+
if (src2 == nullptr && ctx.rope_cache.cached && ctx.rope_cache.ext_factor == ext_factor &&
|
|
2251
|
+
ctx.rope_cache.theta_scale == theta_scale && ctx.rope_cache.freq_scale == freq_scale &&
|
|
2252
|
+
ctx.rope_cache.attn_factor == attn_factor && ctx.rope_cache.is_neox == is_neox) {
|
|
2253
|
+
// use cache.
|
|
2278
2254
|
return;
|
|
2279
2255
|
}
|
|
2280
2256
|
|
|
2281
|
-
|
|
2282
|
-
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
GGML_TENSOR_BINARY_OP_LOCALS
|
|
2286
|
-
|
|
2287
|
-
int64_t theta_scale_length = ne00 / 2;
|
|
2288
|
-
int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
|
|
2289
|
-
size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
|
|
2290
|
-
theta_scale_length * sizeof(float_t)};
|
|
2257
|
+
int64_t theta_scale_length = src0->ne[0] / 2;
|
|
2258
|
+
int64_t theta_scale_ne[] = { theta_scale_length, 1, 1, 1 };
|
|
2259
|
+
size_t theta_scale_nb[] = { sizeof(float), sizeof(float), sizeof(float), theta_scale_length * sizeof(float) };
|
|
2291
2260
|
|
|
2292
2261
|
GGML_ASSERT(src1->type == GGML_TYPE_I32);
|
|
2293
2262
|
int64_t position_length = src1->ne[0];
|
|
2294
|
-
int64_t position_ne[]
|
|
2295
|
-
size_t
|
|
2296
|
-
sizeof(int32_t) * position_length};
|
|
2263
|
+
int64_t position_ne[] = { 1, 1, position_length, 1 };
|
|
2264
|
+
size_t position_nb[] = { sizeof(int32_t), sizeof(int32_t), sizeof(int32_t), sizeof(int32_t) * position_length };
|
|
2297
2265
|
|
|
2298
|
-
int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
|
|
2299
|
-
size_t
|
|
2300
|
-
theta_nb[0] = sizeof(
|
|
2266
|
+
int64_t theta_ne[] = { theta_scale_length, 1, position_length, 1 };
|
|
2267
|
+
size_t theta_nb[GGML_MAX_DIMS];
|
|
2268
|
+
theta_nb[0] = sizeof(float);
|
|
2301
2269
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2302
2270
|
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
|
|
2303
2271
|
}
|
|
2304
2272
|
|
|
2305
|
-
//
|
|
2306
|
-
|
|
2307
|
-
|
|
2308
|
-
|
|
2309
|
-
|
|
2273
|
+
// theta_scale arange, [0,1,...,ne00/2 - 1]
|
|
2274
|
+
acl_tensor_ptr acl_theta_scale_tensor;
|
|
2275
|
+
// cache theta scale
|
|
2276
|
+
if (ctx.rope_cache.theta_scale_length != theta_scale_length ||
|
|
2277
|
+
// theta_scale and freq_scale should not change during the current token inference process,
|
|
2278
|
+
// so we can directly use == here instead of comparing the absolute difference.
|
|
2279
|
+
ctx.rope_cache.theta_scale != theta_scale || ctx.rope_cache.freq_scale != freq_scale) {
|
|
2280
|
+
ctx.rope_cache.theta_scale_length = theta_scale_length;
|
|
2281
|
+
|
|
2282
|
+
if (ctx.rope_cache.theta_scale_cache != nullptr) {
|
|
2283
|
+
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
|
|
2284
|
+
}
|
|
2285
|
+
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float),
|
|
2286
|
+
ACL_MEM_MALLOC_HUGE_FIRST));
|
|
2287
|
+
|
|
2288
|
+
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
|
|
2289
|
+
theta_scale_ne, theta_scale_nb, 1);
|
|
2290
|
+
|
|
2291
|
+
float start = 0;
|
|
2292
|
+
float step = 1;
|
|
2293
|
+
float stop = theta_scale_length;
|
|
2294
|
+
float n_elements = theta_scale_length;
|
|
2295
|
+
aclnn_arange(ctx, acl_theta_scale_tensor.get(), start, stop, step, n_elements);
|
|
2296
|
+
|
|
2297
|
+
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
|
|
2298
|
+
acl_tensor_ptr acl_yarn_ramp_tensor;
|
|
2299
|
+
if (ext_factor != 0) {
|
|
2300
|
+
// -rope_yarn_ramp
|
|
2301
|
+
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
|
|
2302
|
+
// return MIN(1, MAX(0, y)) - 1;
|
|
2303
|
+
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
|
|
2304
|
+
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
|
|
2305
|
+
acl_yarn_ramp_tensor =
|
|
2306
|
+
ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne, theta_scale_nb, 1);
|
|
2307
|
+
float zero_value = 0, one_value = 1;
|
|
2308
|
+
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
|
|
2309
|
+
acl_scalar_ptr low = ggml_cann_create_scalar(&corr_dims[0], aclDataType::ACL_FLOAT);
|
|
2310
|
+
acl_scalar_ptr zero = ggml_cann_create_scalar(&zero_value, aclDataType::ACL_FLOAT);
|
|
2311
|
+
acl_scalar_ptr one = ggml_cann_create_scalar(&one_value, aclDataType::ACL_FLOAT);
|
|
2312
|
+
acl_scalar_ptr denom_safe = ggml_cann_create_scalar(&denom_safe_value, aclDataType::ACL_FLOAT);
|
|
2313
|
+
acl_scalar_ptr ext_factor_sc = ggml_cann_create_scalar(&ext_factor, aclDataType::ACL_FLOAT);
|
|
2314
|
+
|
|
2315
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor.get(), low.get(), one.get(),
|
|
2316
|
+
acl_yarn_ramp_tensor.get());
|
|
2317
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor.get(), denom_safe.get());
|
|
2318
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor.get(), zero.get(), zero.get());
|
|
2319
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor.get(), one.get());
|
|
2320
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor.get(), one.get(), one.get());
|
|
2321
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), ext_factor_sc.get());
|
|
2322
|
+
|
|
2323
|
+
// theta_interp = freq_scale * theta_extrap;
|
|
2324
|
+
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
|
2325
|
+
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
|
|
2326
|
+
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
|
|
2327
|
+
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
|
|
2328
|
+
//
|
|
2329
|
+
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
|
|
2330
|
+
// cache freq_scale + (freq_scale - 1) * ramp_mix
|
|
2331
|
+
float freq_scale_1 = freq_scale - 1;
|
|
2332
|
+
acl_scalar_ptr freq_scale_sc = ggml_cann_create_scalar(&freq_scale, aclDataType::ACL_FLOAT);
|
|
2333
|
+
acl_scalar_ptr freq_scale_1_sc = ggml_cann_create_scalar(&freq_scale_1, aclDataType::ACL_FLOAT);
|
|
2334
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor.get(), freq_scale_1_sc.get());
|
|
2335
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor.get(), freq_scale_sc.get(), one.get());
|
|
2310
2336
|
}
|
|
2311
|
-
ACL_CHECK(aclrtMalloc(&ctx.rope_init_ptr, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
|
|
2312
|
-
|
|
2313
|
-
aclTensor* acl_theta_scale_tensor =
|
|
2314
|
-
ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
|
|
2315
|
-
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
|
2316
|
-
float start = 0;
|
|
2317
|
-
float step = 1;
|
|
2318
|
-
float stop = ne00 / 2;
|
|
2319
|
-
float n_elements = ne00 / 2;
|
|
2320
|
-
aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
|
|
2321
2337
|
|
|
2322
2338
|
// power
|
|
2323
|
-
|
|
2324
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
|
|
2325
|
-
acl_theta_scale_tensor);
|
|
2326
|
-
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2339
|
+
acl_scalar_ptr acl_theta_scale = ggml_cann_create_scalar(&theta_scale, aclDataType::ACL_FLOAT);
|
|
2340
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale.get(), acl_theta_scale_tensor.get(),
|
|
2341
|
+
acl_theta_scale_tensor.get());
|
|
2342
|
+
|
|
2343
|
+
if (ext_factor != 0) {
|
|
2344
|
+
aclnn_mul(ctx, acl_theta_scale_tensor.get(), acl_yarn_ramp_tensor.get());
|
|
2345
|
+
} else if (freq_scale != 1) {
|
|
2346
|
+
aclnn_muls(ctx, acl_theta_scale_tensor.get(), freq_scale, nullptr, true);
|
|
2330
2347
|
}
|
|
2348
|
+
} else {
|
|
2349
|
+
// use cache
|
|
2350
|
+
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
|
|
2351
|
+
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
|
2352
|
+
}
|
|
2331
2353
|
|
|
2332
|
-
|
|
2333
|
-
|
|
2334
|
-
|
|
2335
|
-
|
|
2336
|
-
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2354
|
+
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
|
|
2355
|
+
// freq_factors
|
|
2356
|
+
if (src2) {
|
|
2357
|
+
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
|
|
2358
|
+
void * freq_fac_res_ptr = freq_fac_res_allocator.get();
|
|
2359
|
+
acl_tensor_ptr acl_freq_factors_tensor =
|
|
2360
|
+
ggml_cann_create_tensor(src2->data, ggml_cann_type_mapping(src2->type), ggml_type_size(src2->type),
|
|
2361
|
+
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
|
2362
|
+
acl_tensor_ptr acl_freq_fac_res_tensor = ggml_cann_create_tensor(freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
|
|
2363
|
+
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
|
2364
|
+
aclnn_div(ctx, acl_theta_scale_tensor.get(), acl_freq_factors_tensor.get(), acl_freq_fac_res_tensor.get());
|
|
2365
|
+
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
|
|
2342
2366
|
}
|
|
2343
2367
|
|
|
2344
|
-
// init sin_repeat && cos_repeat,
|
|
2345
|
-
if(position_length > ctx.
|
|
2346
|
-
ctx.
|
|
2347
|
-
|
|
2348
|
-
|
|
2349
|
-
|
|
2350
|
-
|
|
2368
|
+
// init sin_repeat && cos_repeat, only to accelerate first layer on each device
|
|
2369
|
+
if (position_length > ctx.rope_cache.position_length) {
|
|
2370
|
+
ctx.rope_cache.position_length = position_length;
|
|
2371
|
+
if (ctx.rope_cache.sin_cache != nullptr) {
|
|
2372
|
+
ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
|
|
2373
|
+
}
|
|
2374
|
+
if (ctx.rope_cache.cos_cache != nullptr) {
|
|
2375
|
+
ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
|
|
2351
2376
|
}
|
|
2352
|
-
|
|
2353
|
-
ACL_CHECK(
|
|
2377
|
+
int64_t repeat_theta_length = theta_scale_length * position_length * 2;
|
|
2378
|
+
ACL_CHECK(
|
|
2379
|
+
aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
|
|
2380
|
+
ACL_CHECK(
|
|
2381
|
+
aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
|
|
2354
2382
|
}
|
|
2355
2383
|
|
|
2356
|
-
aclTensor* acl_theta_scale_tensor =
|
|
2357
|
-
ggml_cann_create_tensor(ctx.rope_init_ptr, ACL_FLOAT, sizeof(float_t),
|
|
2358
|
-
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
|
|
2359
|
-
|
|
2360
2384
|
// position
|
|
2361
|
-
|
|
2362
|
-
src1->data, ggml_cann_type_mapping(src1->type),
|
|
2363
|
-
|
|
2385
|
+
acl_tensor_ptr acl_position_tensor =
|
|
2386
|
+
ggml_cann_create_tensor(src1->data, ggml_cann_type_mapping(src1->type), ggml_type_size(src1->type), position_ne,
|
|
2387
|
+
position_nb, GGML_MAX_DIMS);
|
|
2364
2388
|
|
|
2365
2389
|
// power * position
|
|
2366
|
-
int64_t
|
|
2367
|
-
ggml_cann_pool_alloc theta_allocator(ctx.pool(),
|
|
2368
|
-
|
|
2369
|
-
void* theta_buffer = theta_allocator.get();
|
|
2390
|
+
int64_t theta_length = theta_scale_length * position_length;
|
|
2391
|
+
ggml_cann_pool_alloc theta_allocator(ctx.pool(), theta_length * sizeof(float));
|
|
2392
|
+
void * theta_buffer = theta_allocator.get();
|
|
2370
2393
|
|
|
2371
|
-
|
|
2372
|
-
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(
|
|
2373
|
-
|
|
2374
|
-
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
|
|
2375
|
-
acl_theta_tensor);
|
|
2394
|
+
acl_tensor_ptr acl_theta_tensor =
|
|
2395
|
+
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS);
|
|
2396
|
+
aclnn_mul(ctx, acl_position_tensor.get(), acl_theta_scale_tensor.get(), acl_theta_tensor.get());
|
|
2376
2397
|
|
|
2377
2398
|
// sin/cos
|
|
2378
|
-
ggml_cann_pool_alloc sin_allocator(ctx.pool(),
|
|
2379
|
-
|
|
2380
|
-
|
|
2381
|
-
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2388
|
-
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2399
|
+
ggml_cann_pool_alloc sin_allocator(ctx.pool(), theta_length * sizeof(float));
|
|
2400
|
+
void * sin_buffer = sin_allocator.get();
|
|
2401
|
+
acl_tensor_ptr acl_sin_tensor =
|
|
2402
|
+
ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
2403
|
+
aclnn_sin(ctx, acl_theta_tensor.get(), acl_sin_tensor.get());
|
|
2404
|
+
|
|
2405
|
+
ggml_cann_pool_alloc cos_allocator(ctx.pool(), theta_length * sizeof(float));
|
|
2406
|
+
void * cos_buffer = cos_allocator.get();
|
|
2407
|
+
acl_tensor_ptr acl_cos_tensor =
|
|
2408
|
+
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb, GGML_MAX_DIMS, ACL_FORMAT_ND);
|
|
2409
|
+
aclnn_cos(ctx, acl_theta_tensor.get(), acl_cos_tensor.get());
|
|
2410
|
+
|
|
2411
|
+
if (ext_factor != 0) {
|
|
2412
|
+
attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
|
|
2413
|
+
}
|
|
2393
2414
|
|
|
2394
2415
|
// attn_factor
|
|
2395
2416
|
if (attn_factor != 1) {
|
|
2396
|
-
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
|
|
2397
|
-
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
|
|
2417
|
+
aclnn_muls(ctx, acl_sin_tensor.get(), attn_factor, nullptr, true);
|
|
2418
|
+
aclnn_muls(ctx, acl_cos_tensor.get(), attn_factor, nullptr, true);
|
|
2398
2419
|
}
|
|
2399
2420
|
|
|
2400
|
-
int64_t sin_reshape_ne[4] = {
|
|
2401
|
-
size_t
|
|
2402
|
-
sin_reshape_nb[0] = sizeof(
|
|
2421
|
+
int64_t sin_reshape_ne[4] = { src0->ne[0], 1, src0->ne[2], 1 };
|
|
2422
|
+
size_t sin_reshape_nb[GGML_MAX_DIMS];
|
|
2423
|
+
sin_reshape_nb[0] = sizeof(float);
|
|
2403
2424
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2404
2425
|
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
|
|
2405
2426
|
}
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
|
|
2411
|
-
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2427
|
+
acl_tensor_ptr acl_sin_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
|
|
2428
|
+
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2429
|
+
acl_tensor_ptr acl_cos_repeat_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
|
|
2430
|
+
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2412
2431
|
|
|
2413
2432
|
// repeat
|
|
2414
2433
|
if (is_neox) {
|
|
2415
|
-
int64_t repeatsArray[] = {1, 1, 1, 2};
|
|
2416
|
-
aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray);
|
|
2417
|
-
aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray);
|
|
2434
|
+
int64_t repeatsArray[] = { 1, 1, 1, 2 };
|
|
2435
|
+
aclnn_repeat(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), repeatsArray);
|
|
2436
|
+
aclnn_repeat(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), repeatsArray);
|
|
2418
2437
|
} else {
|
|
2419
2438
|
int64_t num_repeats = 2;
|
|
2420
|
-
int64_t dim
|
|
2439
|
+
int64_t dim = 3;
|
|
2421
2440
|
int64_t output_size = theta_scale_length * num_repeats;
|
|
2422
|
-
aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim,
|
|
2423
|
-
|
|
2424
|
-
aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim,
|
|
2425
|
-
num_repeats, output_size);
|
|
2441
|
+
aclnn_repeat_interleave(ctx, acl_sin_tensor.get(), acl_sin_repeat_tensor.get(), dim, num_repeats, output_size);
|
|
2442
|
+
aclnn_repeat_interleave(ctx, acl_cos_tensor.get(), acl_cos_repeat_tensor.get(), dim, num_repeats, output_size);
|
|
2426
2443
|
}
|
|
2427
2444
|
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2445
|
+
// Other layers use cache except first layer.
|
|
2446
|
+
ctx.rope_cache.cached = true;
|
|
2447
|
+
ctx.rope_cache.ext_factor = ext_factor;
|
|
2448
|
+
ctx.rope_cache.theta_scale = theta_scale;
|
|
2449
|
+
ctx.rope_cache.freq_scale = freq_scale;
|
|
2450
|
+
ctx.rope_cache.attn_factor = attn_factor;
|
|
2451
|
+
ctx.rope_cache.is_neox = is_neox;
|
|
2431
2452
|
}
|
|
2432
2453
|
|
|
2433
2454
|
#ifdef __cplusplus
|
|
2434
2455
|
extern "C" {
|
|
2435
2456
|
#endif
|
|
2436
|
-
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
|
|
2440
|
-
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2457
|
+
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(const aclTensor * x,
|
|
2458
|
+
const aclTensor * cos,
|
|
2459
|
+
const aclTensor * sin,
|
|
2460
|
+
int64_t mode,
|
|
2461
|
+
const aclTensor * yOut,
|
|
2462
|
+
uint64_t * workspaceSize,
|
|
2463
|
+
aclOpExecutor ** executor);
|
|
2464
|
+
aclnnStatus aclnnRotaryPositionEmbedding(void * workspace,
|
|
2465
|
+
uint64_t workspaceSize,
|
|
2466
|
+
aclOpExecutor * executor,
|
|
2467
|
+
aclrtStream stream);
|
|
2444
2468
|
#ifdef __cplusplus
|
|
2445
2469
|
}
|
|
2446
2470
|
#endif
|
|
2447
2471
|
|
|
2448
|
-
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
2449
|
-
|
|
2450
|
-
// Only test with LLAMA model.
|
|
2451
|
-
ggml_tensor* src0 = dst->src[0]; // input
|
|
2472
|
+
void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2473
|
+
ggml_tensor * src0 = dst->src[0]; // input
|
|
2452
2474
|
|
|
2453
2475
|
// param
|
|
2454
|
-
float
|
|
2476
|
+
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
|
2455
2477
|
// const int n_past = ((int32_t *) dst->op_params)[0];
|
|
2456
|
-
const int n_dims
|
|
2457
|
-
const int mode
|
|
2478
|
+
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
2479
|
+
const int mode = ((int32_t *) dst->op_params)[2];
|
|
2458
2480
|
// const int n_ctx = ((int32_t *) dst->op_params)[3];
|
|
2459
|
-
const int n_ctx_orig = ((int32_t*)dst->op_params)[4];
|
|
2481
|
+
const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
|
|
2460
2482
|
|
|
2461
2483
|
GGML_TENSOR_UNARY_OP_LOCALS
|
|
2462
2484
|
|
|
2463
|
-
memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float));
|
|
2464
|
-
memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float));
|
|
2465
|
-
memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float));
|
|
2466
|
-
memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float));
|
|
2467
|
-
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
|
|
2468
|
-
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
|
|
2485
|
+
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
2486
|
+
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
2487
|
+
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float));
|
|
2488
|
+
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
|
|
2489
|
+
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
|
|
2490
|
+
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
|
|
2469
2491
|
|
|
2470
2492
|
// TODO: n_dims <= ne0
|
|
2471
2493
|
GGML_ASSERT(n_dims == ne0);
|
|
2472
2494
|
GGML_ASSERT(n_dims % 2 == 0);
|
|
2473
|
-
// TODO: ext_factor != 0
|
|
2474
|
-
GGML_ASSERT(ext_factor == 0);
|
|
2475
2495
|
|
|
2476
2496
|
const float theta_scale = powf(freq_base, -2.0f / n_dims);
|
|
2477
2497
|
|
|
2478
2498
|
float corr_dims[2];
|
|
2479
|
-
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
|
|
2480
|
-
beta_slow, corr_dims);
|
|
2499
|
+
ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
|
|
2481
2500
|
|
|
2482
2501
|
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
|
|
2483
2502
|
|
|
2484
2503
|
// init ctx.rope_cos/rope_sin cache
|
|
2485
|
-
aclnn_cache_init(ctx, dst, theta_scale, freq_scale, attn_factor, is_neox);
|
|
2504
|
+
aclnn_cache_init(ctx, dst, corr_dims, ext_factor, theta_scale, freq_scale, attn_factor, is_neox);
|
|
2486
2505
|
|
|
2487
|
-
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
|
|
2488
|
-
size_t
|
|
2489
|
-
sin_reshape_nb[0] = sizeof(
|
|
2506
|
+
int64_t sin_reshape_ne[4] = { ne00, 1, ne02, 1 };
|
|
2507
|
+
size_t sin_reshape_nb[GGML_MAX_DIMS];
|
|
2508
|
+
sin_reshape_nb[0] = sizeof(float);
|
|
2490
2509
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2491
2510
|
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
|
|
2492
2511
|
}
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
ggml_cann_create_tensor(ctx.rope_cos_ptr, ACL_FLOAT, sizeof(float_t),
|
|
2498
|
-
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2512
|
+
acl_tensor_ptr acl_sin_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
|
|
2513
|
+
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2514
|
+
acl_tensor_ptr acl_cos_reshape_tensor = ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
|
|
2515
|
+
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
|
|
2499
2516
|
|
|
2500
|
-
|
|
2501
|
-
|
|
2517
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
2518
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
2502
2519
|
|
|
2503
2520
|
#ifdef ASCEND_310P
|
|
2504
2521
|
// Special ROPE operation for 310P
|
|
2505
2522
|
|
|
2506
2523
|
// roll input
|
|
2507
|
-
void*
|
|
2508
|
-
|
|
2509
|
-
void*
|
|
2524
|
+
void * input_roll_buffer;
|
|
2525
|
+
acl_tensor_ptr acl_minus_one_tensor;
|
|
2526
|
+
void * minus_one_scale_buffer = nullptr;
|
|
2510
2527
|
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
|
|
2511
|
-
ggml_cann_pool_alloc minus_one_scale_allocator(
|
|
2512
|
-
ctx.pool(), sizeof(float_t) * src0->ne[0]);
|
|
2528
|
+
ggml_cann_pool_alloc minus_one_scale_allocator(ctx.pool(), sizeof(float) * src0->ne[0]);
|
|
2513
2529
|
if (!is_neox) {
|
|
2514
2530
|
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
|
|
2515
|
-
input_roll_buffer
|
|
2516
|
-
int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2),
|
|
2517
|
-
|
|
2518
|
-
size_t input_roll_nb[GGML_MAX_DIMS];
|
|
2531
|
+
input_roll_buffer = roll_allocator.get();
|
|
2532
|
+
int64_t input_roll_ne[4] = { 2, src0->ne[1] * (src0->ne[0] / 2), src0->ne[2], src0->ne[3] };
|
|
2533
|
+
size_t input_roll_nb[GGML_MAX_DIMS];
|
|
2519
2534
|
input_roll_nb[0] = ggml_type_size(src0->type);
|
|
2520
2535
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2521
2536
|
input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1];
|
|
2522
2537
|
}
|
|
2523
|
-
|
|
2524
|
-
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
|
|
2531
|
-
|
|
2532
|
-
|
|
2533
|
-
int64_t dims[] = {3};
|
|
2534
|
-
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
|
2535
|
-
ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
|
|
2538
|
+
acl_tensor_ptr acl_input_roll_tensor =
|
|
2539
|
+
ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
|
2540
|
+
input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
|
|
2541
|
+
acl_tensor_ptr acl_input_tensor =
|
|
2542
|
+
ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
|
2543
|
+
input_roll_ne, input_roll_nb, GGML_MAX_DIMS);
|
|
2544
|
+
|
|
2545
|
+
int64_t shifts[] = { 1 };
|
|
2546
|
+
int64_t dims[] = { 3 };
|
|
2547
|
+
aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
|
|
2536
2548
|
|
|
2537
2549
|
// init [-1, 1, -1, 1, ...]
|
|
2538
2550
|
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
|
2539
2551
|
|
|
2540
|
-
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
|
2541
|
-
size_t
|
|
2542
|
-
minus_one_nb[0] = sizeof(
|
|
2552
|
+
int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
|
|
2553
|
+
size_t minus_one_nb[GGML_MAX_DIMS];
|
|
2554
|
+
minus_one_nb[0] = sizeof(float);
|
|
2543
2555
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2544
2556
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
|
2545
2557
|
}
|
|
2546
|
-
acl_minus_one_tensor = aclnn_values(
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
int64_t
|
|
2550
|
-
int64_t* index = new int64_t[src0->ne[0]];
|
|
2558
|
+
acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
|
|
2559
|
+
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
|
|
2560
|
+
int64_t dim = 3;
|
|
2561
|
+
int64_t * index = new int64_t[src0->ne[0]];
|
|
2551
2562
|
for (int i = 0; i < src0->ne[0]; i++) {
|
|
2552
2563
|
index[i] = i / 2 * 2;
|
|
2553
2564
|
}
|
|
2554
2565
|
int64_t index_num = src0->ne[0];
|
|
2555
|
-
float
|
|
2556
|
-
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index,
|
|
2557
|
-
index_num, value);
|
|
2566
|
+
float value = -1;
|
|
2567
|
+
aclnn_index_fill_tensor(ctx, acl_minus_one_tensor.get(), dim, index, index_num, value);
|
|
2558
2568
|
} else {
|
|
2559
2569
|
// roll input: [q0,q1,q2,...] ->
|
|
2560
2570
|
// [q_half,q_half+1,...,q_end,q0,q1,...q_half-1]
|
|
2561
2571
|
input_roll_buffer = roll_allocator.get();
|
|
2562
|
-
|
|
2563
|
-
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
|
2564
|
-
|
|
2565
|
-
|
|
2572
|
+
acl_tensor_ptr acl_input_roll_tensor =
|
|
2573
|
+
ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
|
2574
|
+
src0->ne, src0->nb, GGML_MAX_DIMS);
|
|
2575
|
+
acl_tensor_ptr acl_input_tensor = ggml_cann_create_tensor(src0);
|
|
2566
2576
|
|
|
2567
|
-
int64_t shifts[] = {src0->ne[0] / 2};
|
|
2568
|
-
int64_t dims[]
|
|
2569
|
-
aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims);
|
|
2577
|
+
int64_t shifts[] = { src0->ne[0] / 2 };
|
|
2578
|
+
int64_t dims[] = { 3 };
|
|
2579
|
+
aclnn_roll(ctx, acl_input_tensor.get(), acl_input_roll_tensor.get(), shifts, dims);
|
|
2570
2580
|
|
|
2571
|
-
ggml_cann_release_resources(ctx, acl_input_roll_tensor, acl_input_tensor);
|
|
2572
2581
|
// init [-1, -1, -1, 1, 1,1,...]
|
|
2573
|
-
minus_one_scale_buffer
|
|
2574
|
-
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
|
|
2575
|
-
size_t
|
|
2576
|
-
minus_one_nb[0] = sizeof(
|
|
2582
|
+
minus_one_scale_buffer = minus_one_scale_allocator.get();
|
|
2583
|
+
int64_t minus_one_ne[4] = { src0->ne[0], 1, 1, 1 };
|
|
2584
|
+
size_t minus_one_nb[GGML_MAX_DIMS];
|
|
2585
|
+
minus_one_nb[0] = sizeof(float);
|
|
2577
2586
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2578
2587
|
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
|
|
2579
2588
|
}
|
|
2580
|
-
acl_minus_one_tensor
|
|
2581
|
-
|
|
2582
|
-
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
|
|
2589
|
+
acl_minus_one_tensor = aclnn_values(ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0], minus_one_ne,
|
|
2590
|
+
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
|
|
2583
2591
|
// -1 * first half
|
|
2584
|
-
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
|
|
2585
|
-
size_t
|
|
2586
|
-
first_half_nb[0] = sizeof(
|
|
2592
|
+
int64_t first_half_ne[4] = { src0->ne[0] / 2, 1, 1, 1 };
|
|
2593
|
+
size_t first_half_nb[GGML_MAX_DIMS];
|
|
2594
|
+
first_half_nb[0] = sizeof(float);
|
|
2587
2595
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2588
2596
|
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
|
|
2589
2597
|
}
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace);
|
|
2596
|
-
ggml_cann_release_resources(ctx, acl_first_half_tensor);
|
|
2598
|
+
acl_tensor_ptr acl_first_half_tensor = ggml_cann_create_tensor(minus_one_scale_buffer, ACL_FLOAT, sizeof(float),
|
|
2599
|
+
first_half_ne, first_half_nb, GGML_MAX_DIMS);
|
|
2600
|
+
bool inplace = true;
|
|
2601
|
+
float scale = -1;
|
|
2602
|
+
aclnn_muls(ctx, acl_first_half_tensor.get(), scale, nullptr, inplace);
|
|
2597
2603
|
}
|
|
2598
2604
|
|
|
2599
2605
|
// TODO: n_dims < ne0
|
|
2600
2606
|
GGML_ASSERT(n_dims == src0->ne[0]);
|
|
2601
2607
|
|
|
2602
2608
|
// input * scale
|
|
2603
|
-
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(),
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
size_t input_nb[GGML_MAX_DIMS];
|
|
2609
|
+
ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), ggml_nbytes(src0));
|
|
2610
|
+
void * input_roll_mul_scale_buffer = roll_mul_scale_allocator.get();
|
|
2611
|
+
size_t input_nb[GGML_MAX_DIMS];
|
|
2607
2612
|
input_nb[0] = ggml_type_size(src0->type);
|
|
2608
2613
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2609
2614
|
input_nb[i] = input_nb[i - 1] * src0->ne[i - 1];
|
|
2610
2615
|
}
|
|
2611
|
-
|
|
2612
|
-
input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
|
|
2613
|
-
|
|
2614
|
-
|
|
2615
|
-
input_roll_buffer, ggml_cann_type_mapping(src0->type),
|
|
2616
|
-
|
|
2616
|
+
acl_tensor_ptr acl_input_roll_mul_scale_tensor =
|
|
2617
|
+
ggml_cann_create_tensor(input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type),
|
|
2618
|
+
ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS);
|
|
2619
|
+
acl_tensor_ptr acl_input_roll_reshape_tensor =
|
|
2620
|
+
ggml_cann_create_tensor(input_roll_buffer, ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
|
2621
|
+
src0->ne, input_nb, GGML_MAX_DIMS);
|
|
2617
2622
|
|
|
2618
|
-
aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor,
|
|
2619
|
-
acl_input_roll_mul_scale_tensor);
|
|
2623
|
+
aclnn_mul(ctx, acl_input_roll_reshape_tensor.get(), acl_minus_one_tensor.get(),
|
|
2624
|
+
acl_input_roll_mul_scale_tensor.get());
|
|
2620
2625
|
|
|
2621
2626
|
// output
|
|
2622
|
-
void* output_fp32_buffer;
|
|
2627
|
+
void * output_fp32_buffer;
|
|
2623
2628
|
if (src0->type == GGML_TYPE_F32) {
|
|
2624
|
-
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor);
|
|
2625
|
-
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor,
|
|
2626
|
-
|
|
2627
|
-
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
|
|
2629
|
+
aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get());
|
|
2630
|
+
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get());
|
|
2631
|
+
aclnn_add(ctx, acl_src.get(), acl_input_roll_mul_scale_tensor.get(), acl_dst.get());
|
|
2628
2632
|
// TODO: ne0 != n_dims in mode2
|
|
2629
2633
|
} else if (src0->type == GGML_TYPE_F16) {
|
|
2630
2634
|
size_t input_fp32_nb[GGML_MAX_DIMS];
|
|
2631
|
-
input_fp32_nb[0] = sizeof(
|
|
2635
|
+
input_fp32_nb[0] = sizeof(float);
|
|
2632
2636
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2633
2637
|
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
|
|
2634
2638
|
}
|
|
2635
|
-
ggml_cann_pool_alloc fp32_allocator1(
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
2643
|
-
|
|
2644
|
-
|
|
2645
|
-
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
2649
|
-
|
|
2650
|
-
|
|
2651
|
-
|
|
2652
|
-
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
|
|
2653
|
-
input_fp32_nb, GGML_MAX_DIMS);
|
|
2654
|
-
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
|
|
2655
|
-
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
|
|
2656
|
-
input_fp32_tensor2);
|
|
2657
|
-
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
|
|
2658
|
-
output_fp32_tensor);
|
|
2659
|
-
aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16);
|
|
2660
|
-
|
|
2661
|
-
ggml_cann_release_resources(ctx, input_fp32_tensor1, input_fp32_tensor2,
|
|
2662
|
-
output_fp32_tensor, acl_sin_reshape_tensor,
|
|
2663
|
-
acl_minus_one_tensor, acl_input_roll_mul_scale_tensor,
|
|
2664
|
-
acl_input_roll_reshape_tensor, acl_src);
|
|
2639
|
+
ggml_cann_pool_alloc fp32_allocator1(ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
|
2640
|
+
void * input_fp32_buffer1 = fp32_allocator1.get();
|
|
2641
|
+
acl_tensor_ptr input_fp32_tensor1 = ggml_cann_create_tensor(input_fp32_buffer1, ACL_FLOAT, sizeof(float),
|
|
2642
|
+
dst->ne, input_fp32_nb, GGML_MAX_DIMS);
|
|
2643
|
+
ggml_cann_pool_alloc fp32_allocator2(ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
|
2644
|
+
void * input_fp32_buffer2 = fp32_allocator2.get();
|
|
2645
|
+
acl_tensor_ptr input_fp32_tensor2 = ggml_cann_create_tensor(input_fp32_buffer2, ACL_FLOAT, sizeof(float),
|
|
2646
|
+
dst->ne, input_fp32_nb, GGML_MAX_DIMS);
|
|
2647
|
+
|
|
2648
|
+
ggml_cann_pool_alloc fp32_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
|
2649
|
+
output_fp32_buffer = fp32_allocator.get();
|
|
2650
|
+
acl_tensor_ptr output_fp32_tensor = ggml_cann_create_tensor(output_fp32_buffer, ACL_FLOAT, sizeof(float),
|
|
2651
|
+
dst->ne, input_fp32_nb, GGML_MAX_DIMS);
|
|
2652
|
+
aclnn_mul(ctx, acl_src.get(), acl_cos_reshape_tensor.get(), input_fp32_tensor1.get());
|
|
2653
|
+
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor.get(), acl_sin_reshape_tensor.get(), input_fp32_tensor2.get());
|
|
2654
|
+
aclnn_add(ctx, input_fp32_tensor1.get(), input_fp32_tensor2.get(), output_fp32_tensor.get());
|
|
2655
|
+
aclnn_cast(ctx, output_fp32_tensor.get(), acl_dst.get(), ACL_FLOAT16);
|
|
2665
2656
|
}
|
|
2666
2657
|
return;
|
|
2667
2658
|
#endif
|
|
@@ -2670,178 +2661,150 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
2670
2661
|
int64_t acl_mode = mode == 0 ? 1 : mode;
|
|
2671
2662
|
|
|
2672
2663
|
switch (src0->type) {
|
|
2673
|
-
case GGML_TYPE_F32:
|
|
2674
|
-
|
|
2675
|
-
|
|
2676
|
-
|
|
2677
|
-
|
|
2678
|
-
case GGML_TYPE_F16: {
|
|
2679
|
-
ggml_cann_pool_alloc src_trans_allocator(
|
|
2680
|
-
ctx.pool(), ggml_nelements(src0) * sizeof(float));
|
|
2681
|
-
void* src_trans_buffer = src_trans_allocator.get();
|
|
2682
|
-
ggml_cann_pool_alloc dst_trans_allocator(
|
|
2683
|
-
ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
|
2684
|
-
void* dst_trans_buffer = dst_trans_allocator.get();
|
|
2685
|
-
|
|
2686
|
-
size_t src_trans_nb[GGML_MAX_DIMS];
|
|
2687
|
-
src_trans_nb[0] = sizeof(float);
|
|
2688
|
-
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2689
|
-
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
|
2664
|
+
case GGML_TYPE_F32:
|
|
2665
|
+
{
|
|
2666
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
|
|
2667
|
+
acl_sin_reshape_tensor.get(), acl_mode, acl_dst.get());
|
|
2668
|
+
break;
|
|
2690
2669
|
}
|
|
2670
|
+
case GGML_TYPE_F16:
|
|
2671
|
+
{
|
|
2672
|
+
ggml_cann_pool_alloc src_trans_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(float));
|
|
2673
|
+
void * src_trans_buffer = src_trans_allocator.get();
|
|
2674
|
+
ggml_cann_pool_alloc dst_trans_allocator(ctx.pool(), ggml_nelements(dst) * sizeof(float));
|
|
2675
|
+
void * dst_trans_buffer = dst_trans_allocator.get();
|
|
2676
|
+
|
|
2677
|
+
size_t src_trans_nb[GGML_MAX_DIMS];
|
|
2678
|
+
src_trans_nb[0] = sizeof(float);
|
|
2679
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
2680
|
+
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
|
2681
|
+
}
|
|
2691
2682
|
|
|
2692
|
-
|
|
2693
|
-
|
|
2694
|
-
|
|
2695
|
-
|
|
2696
|
-
dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb,
|
|
2697
|
-
GGML_MAX_DIMS);
|
|
2698
|
-
|
|
2699
|
-
aclnn_cast(ctx, acl_src, acl_src_trans_tensor, ACL_FLOAT);
|
|
2683
|
+
acl_tensor_ptr acl_src_trans_tensor = ggml_cann_create_tensor(
|
|
2684
|
+
src_trans_buffer, ACL_FLOAT, sizeof(float), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
2685
|
+
acl_tensor_ptr acl_dst_trans_tensor = ggml_cann_create_tensor(
|
|
2686
|
+
dst_trans_buffer, ACL_FLOAT, sizeof(float), dst->ne, src_trans_nb, GGML_MAX_DIMS);
|
|
2700
2687
|
|
|
2701
|
-
|
|
2702
|
-
acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
|
|
2703
|
-
acl_dst_trans_tensor);
|
|
2688
|
+
aclnn_cast(ctx, acl_src.get(), acl_src_trans_tensor.get(), ACL_FLOAT);
|
|
2704
2689
|
|
|
2705
|
-
|
|
2690
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(),
|
|
2691
|
+
acl_cos_reshape_tensor.get(), acl_sin_reshape_tensor.get(), acl_mode,
|
|
2692
|
+
acl_dst_trans_tensor.get());
|
|
2706
2693
|
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
}
|
|
2694
|
+
aclnn_cast(ctx, acl_dst_trans_tensor.get(), acl_dst.get(), ACL_FLOAT16);
|
|
2695
|
+
break;
|
|
2696
|
+
}
|
|
2711
2697
|
default:
|
|
2712
2698
|
GGML_ABORT("Unsupported tensor type for GGML_OP_ROPE");
|
|
2713
2699
|
break;
|
|
2714
2700
|
}
|
|
2715
|
-
ggml_cann_release_resources(ctx, acl_cos_reshape_tensor,
|
|
2716
|
-
acl_sin_reshape_tensor, acl_src, acl_dst);
|
|
2717
2701
|
}
|
|
2718
2702
|
|
|
2719
|
-
|
|
2720
|
-
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2703
|
+
void ggml_cann_argmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2721
2704
|
ggml_tensor * src0 = dst->src[0];
|
|
2722
2705
|
|
|
2723
|
-
|
|
2724
|
-
|
|
2706
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
2707
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
|
|
2725
2708
|
|
|
2726
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src, 3, false, acl_dst);
|
|
2727
|
-
|
|
2728
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
2709
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ArgMax, acl_src.get(), 3, false, acl_dst.get());
|
|
2729
2710
|
}
|
|
2730
2711
|
|
|
2731
|
-
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2712
|
+
void ggml_cann_conv_transpose_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2732
2713
|
ggml_tensor * src0 = dst->src[0];
|
|
2733
2714
|
ggml_tensor * src1 = dst->src[1];
|
|
2734
2715
|
|
|
2735
2716
|
// stride
|
|
2736
|
-
int64_t s0 = ((const int32_t*)(dst->op_params))[0];
|
|
2717
|
+
int64_t s0 = ((const int32_t *) (dst->op_params))[0];
|
|
2737
2718
|
|
|
2738
|
-
|
|
2739
|
-
|
|
2740
|
-
|
|
2719
|
+
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src1, src1->ne, src1->nb, 3, ACL_FORMAT_NCL);
|
|
2720
|
+
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3, ACL_FORMAT_NCL);
|
|
2721
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3, ACL_FORMAT_NCL);
|
|
2741
2722
|
|
|
2742
2723
|
int64_t strideVal[1];
|
|
2743
|
-
strideVal[0]
|
|
2744
|
-
|
|
2745
|
-
int64_t
|
|
2746
|
-
|
|
2747
|
-
int64_t
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
int64_t groups = 1;
|
|
2751
|
-
int8_t cubeMathType = 0;
|
|
2724
|
+
strideVal[0] = s0;
|
|
2725
|
+
acl_int_array_ptr stride = ggml_cann_create_int_array(strideVal, 1);
|
|
2726
|
+
int64_t paddingVal[] = { 0 };
|
|
2727
|
+
acl_int_array_ptr padding = ggml_cann_create_int_array(paddingVal, 1);
|
|
2728
|
+
int64_t dilationVal[] = { 1 };
|
|
2729
|
+
acl_int_array_ptr dilation = ggml_cann_create_int_array(dilationVal, 1);
|
|
2730
|
+
int8_t cubeMathType = 0;
|
|
2752
2731
|
|
|
2753
2732
|
#ifdef ASCEND_310P
|
|
2754
2733
|
cubeMathType = 1;
|
|
2755
2734
|
#endif
|
|
2756
2735
|
|
|
2757
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
|
|
2758
|
-
|
|
2759
|
-
|
|
2760
|
-
ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
|
|
2736
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input.get(), acl_weight.get(), nullptr, stride.get(), padding.get(),
|
|
2737
|
+
dilation.get(), true, padding.get(), 1, acl_dst.get(), cubeMathType);
|
|
2761
2738
|
}
|
|
2762
2739
|
|
|
2763
|
-
void ggml_cann_elu(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2740
|
+
void ggml_cann_elu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2764
2741
|
ggml_tensor * src0 = dst->src[0];
|
|
2765
2742
|
|
|
2766
|
-
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
float alphaValue = 1.0f;
|
|
2770
|
-
aclScalar* alpha = nullptr;
|
|
2771
|
-
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
2743
|
+
acl_tensor_ptr acl_input = ggml_cann_create_tensor(src0);
|
|
2744
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
2772
2745
|
|
|
2773
|
-
|
|
2774
|
-
|
|
2746
|
+
float alphaValue = 1.0f;
|
|
2747
|
+
acl_scalar_ptr alpha = nullptr;
|
|
2748
|
+
alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
2775
2749
|
|
|
2776
|
-
|
|
2750
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Elu, acl_input.get(), alpha.get(), alpha.get(), alpha.get(), acl_dst.get());
|
|
2777
2751
|
}
|
|
2778
2752
|
|
|
2779
|
-
void ggml_cann_mean(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2753
|
+
void ggml_cann_mean(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2780
2754
|
ggml_tensor * src0 = dst->src[0];
|
|
2781
2755
|
|
|
2782
|
-
|
|
2783
|
-
|
|
2756
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
2757
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
2784
2758
|
|
|
2785
|
-
int64_t
|
|
2786
|
-
|
|
2787
|
-
bool
|
|
2759
|
+
int64_t reduceDimValue[] = { 3 };
|
|
2760
|
+
acl_int_array_ptr reduceDim = ggml_cann_create_int_array(reduceDimValue, 1);
|
|
2761
|
+
bool keepDim = true;
|
|
2788
2762
|
|
|
2789
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src, reduceDim, keepDim, ACL_FLOAT, acl_dst);
|
|
2790
|
-
|
|
2791
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, reduceDim);
|
|
2763
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, Mean, acl_src.get(), reduceDim.get(), keepDim, ACL_FLOAT, acl_dst.get());
|
|
2792
2764
|
}
|
|
2793
2765
|
|
|
2794
|
-
void ggml_cann_pad_reflect_1d(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2795
|
-
ggml_tensor *
|
|
2796
|
-
int32_t *opts
|
|
2797
|
-
int64_t
|
|
2798
|
-
|
|
2766
|
+
void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2767
|
+
ggml_tensor * src0 = dst->src[0];
|
|
2768
|
+
int32_t * opts = (int32_t *) dst->op_params;
|
|
2769
|
+
int64_t paddingsArray[2] = { opts[0], opts[1] };
|
|
2770
|
+
acl_int_array_ptr paddings = ggml_cann_create_int_array(paddingsArray, 2);
|
|
2799
2771
|
|
|
2800
2772
|
for (int64_t i = 0; i < src0->ne[3]; i++) {
|
|
2801
|
-
|
|
2802
|
-
(char*)src0->data + i * src0->ne[3],
|
|
2803
|
-
|
|
2804
|
-
src0->ne, src0->nb, 3);
|
|
2773
|
+
acl_tensor_ptr acl_src =
|
|
2774
|
+
ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
|
|
2775
|
+
ggml_element_size(src0), src0->ne, src0->nb, 3);
|
|
2805
2776
|
|
|
2806
|
-
|
|
2807
|
-
(char*)dst->data + i * src0->ne[3],
|
|
2808
|
-
|
|
2809
|
-
dst->ne, dst->nb, 3);
|
|
2777
|
+
acl_tensor_ptr acl_dst =
|
|
2778
|
+
ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
|
|
2779
|
+
ggml_element_size(dst), dst->ne, dst->nb, 3);
|
|
2810
2780
|
|
|
2811
|
-
|
|
2812
|
-
|
|
2813
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst);
|
|
2781
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
|
|
2814
2782
|
}
|
|
2815
|
-
ggml_cann_release_resources(ctx, paddings);
|
|
2816
2783
|
}
|
|
2817
2784
|
|
|
2818
|
-
void ggml_cann_count_equal(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2785
|
+
void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2819
2786
|
ggml_tensor * src0 = dst->src[0];
|
|
2820
2787
|
ggml_tensor * src1 = dst->src[1];
|
|
2821
2788
|
|
|
2822
|
-
|
|
2823
|
-
|
|
2789
|
+
acl_tensor_ptr acl_self = ggml_cann_create_tensor(src0);
|
|
2790
|
+
acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
|
|
2824
2791
|
|
|
2825
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self, acl_other);
|
|
2792
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceEqTensor, acl_self.get(), acl_other.get());
|
|
2826
2793
|
|
|
2827
2794
|
ggml_cann_sum(ctx, dst);
|
|
2828
|
-
|
|
2829
|
-
ggml_cann_release_resources(ctx, acl_self, acl_other);
|
|
2830
2795
|
}
|
|
2831
2796
|
|
|
2832
|
-
void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
2797
|
+
void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2833
2798
|
ggml_tensor * src0 = dst->src[0];
|
|
2834
2799
|
|
|
2835
|
-
|
|
2836
|
-
|
|
2800
|
+
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
|
2801
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
|
2837
2802
|
|
|
2838
|
-
float
|
|
2839
|
-
|
|
2840
|
-
alpha
|
|
2803
|
+
float alphaValue = 0.0f;
|
|
2804
|
+
acl_scalar_ptr alpha = nullptr;
|
|
2805
|
+
alpha = ggml_cann_create_scalar(&alphaValue, aclDataType::ACL_FLOAT);
|
|
2841
2806
|
|
|
2842
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src, alpha, acl_dst);
|
|
2843
|
-
|
|
2844
|
-
ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
|
|
2807
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
|
|
2845
2808
|
}
|
|
2846
2809
|
|
|
2847
2810
|
/**
|
|
@@ -2862,176 +2825,54 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
|
2862
2825
|
* @note This function assumes floating-point data types and is designed for
|
|
2863
2826
|
* MoE architectures, possibly involving sparse expert routing.
|
|
2864
2827
|
*/
|
|
2865
|
-
static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
2828
|
+
static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
2866
2829
|
//dst [M, K, N, 1]
|
|
2867
|
-
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
|
|
2868
|
-
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1
|
|
2830
|
+
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1] -> [D, M, K, 1]
|
|
2831
|
+
ggml_tensor * src1 = dst->src[1]; //src1 [D, B, N, 1], B = K or B = 1 -> [D, 1, K, 1]
|
|
2869
2832
|
ggml_tensor * ids = dst->src[2]; //ids [K, N]
|
|
2870
2833
|
|
|
2871
|
-
|
|
2834
|
+
GGML_ASSERT(src0->ne[3] == 1);
|
|
2835
|
+
GGML_ASSERT(src1->ne[3] == 1);
|
|
2836
|
+
GGML_ASSERT(dst->ne[3] == 1);
|
|
2872
2837
|
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
int64_t n_ids = ids->ne[0]; // K
|
|
2876
|
-
|
|
2877
|
-
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
2878
|
-
ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
|
|
2879
|
-
ACL_MEMCPY_DEVICE_TO_HOST);
|
|
2880
|
-
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
|
|
2881
|
-
|
|
2882
|
-
char * src0_original = (char *) src0->data;
|
|
2883
|
-
char * src1_original = (char *) src1->data;
|
|
2884
|
-
char * dst_original = (char *) dst->data;
|
|
2885
|
-
size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
|
|
2838
|
+
int64_t batch = src1->ne[2];
|
|
2839
|
+
GGML_ASSERT(batch == ids->ne[1]);
|
|
2886
2840
|
|
|
2887
|
-
|
|
2888
|
-
|
|
2889
|
-
|
|
2890
|
-
|
|
2891
|
-
|
|
2841
|
+
ggml_cann_pool_alloc export_allocator(ctx.pool(), src0->ne[0] * src0->ne[1] * ids->ne[0] * ggml_element_size(src0));
|
|
2842
|
+
void * export_ptr = export_allocator.get();
|
|
2843
|
+
for (int64_t i = 0; i < batch; i++) {
|
|
2844
|
+
acl_tensor_ptr select_index = ggml_cann_create_tensor(ids, ids->ne, ids->nb, 1, ACL_FORMAT_ND, i * ids->nb[1]);
|
|
2845
|
+
acl_tensor_ptr export_weight = ggml_cann_create_tensor(src0, src0->ne, src0->nb, 3);
|
|
2892
2846
|
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2847
|
+
int64_t select_export_ne[] = { src0->ne[0], src0->ne[1], ids->ne[0] };
|
|
2848
|
+
size_t select_export_nb[3];
|
|
2849
|
+
select_export_nb[0] = src0->nb[0];
|
|
2850
|
+
for (int k = 1; k < 3; k++) {
|
|
2851
|
+
select_export_nb[k] = select_export_nb[k - 1] * select_export_ne[k - 1];
|
|
2897
2852
|
}
|
|
2898
2853
|
|
|
2899
|
-
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
GGML_CANN_CALL_ACLNN_OP(ctx,
|
|
2903
|
-
ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
|
|
2854
|
+
acl_tensor_ptr select_export =
|
|
2855
|
+
ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
|
|
2856
|
+
select_export_ne, select_export_nb, 3);
|
|
2857
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, export_weight.get(), 0, select_index.get(), select_export.get());
|
|
2904
2858
|
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2859
|
+
int64_t select_transpose_ne[] = { select_export_ne[1], select_export_ne[0], select_export_ne[2] };
|
|
2860
|
+
size_t select_transpose_nb[] = { select_export_nb[1], select_export_nb[0], select_export_nb[2] };
|
|
2861
|
+
acl_tensor_ptr select_export_transpose =
|
|
2862
|
+
ggml_cann_create_tensor(export_ptr, ggml_cann_type_mapping(src0->type), ggml_element_size(src0),
|
|
2863
|
+
select_transpose_ne, select_transpose_nb, 3);
|
|
2908
2864
|
|
|
2909
|
-
|
|
2910
|
-
|
|
2911
|
-
|
|
2912
|
-
|
|
2865
|
+
int64_t active_tensor_ne[] = { src1->ne[0], 1, src1->ne[1] };
|
|
2866
|
+
size_t active_tensor_nb[] = { src1->nb[0], src1->nb[1], src1->nb[1] };
|
|
2867
|
+
acl_tensor_ptr active_tensor =
|
|
2868
|
+
ggml_cann_create_tensor(src1, active_tensor_ne, active_tensor_nb, 3, ACL_FORMAT_ND, i * src1->nb[2]);
|
|
2913
2869
|
|
|
2914
|
-
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
// src0_row [D, M, 1, 1] weight without permute
|
|
2919
|
-
src0_row.ne[2] = 1;
|
|
2920
|
-
src0_row.ne[3] = 1;
|
|
2921
|
-
src0_row.nb[0] = ori_src0_nb[0];
|
|
2922
|
-
src0_row.nb[1] = ori_src0_nb[1];
|
|
2923
|
-
src0_row.nb[2] = ori_src0_nb[1];
|
|
2924
|
-
src0_row.nb[3] = ori_src0_nb[1];
|
|
2870
|
+
int64_t dst_ne[] = { dst->ne[0], 1, dst->ne[1] };
|
|
2871
|
+
size_t dst_nb[] = { dst->nb[0], dst->nb[1], dst->nb[1] };
|
|
2872
|
+
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst, dst_ne, dst_nb, 3, ACL_FORMAT_ND, i * dst->nb[2]);
|
|
2925
2873
|
|
|
2926
|
-
|
|
2927
|
-
src1_row.ne[1] = 1;
|
|
2928
|
-
src1_row.ne[2] = 1;
|
|
2929
|
-
src1_row.ne[3] = 1;
|
|
2930
|
-
src1_row.nb[2] = nb11;
|
|
2931
|
-
src1_row.nb[3] = nb11;
|
|
2932
|
-
|
|
2933
|
-
// dst_row [M, 1, 1, 1] -> out
|
|
2934
|
-
dst_row.ne[1] = 1;
|
|
2935
|
-
dst_row.ne[2] = 1;
|
|
2936
|
-
dst_row.ne[3] = 1;
|
|
2937
|
-
dst_row.nb[2] = nb1;
|
|
2938
|
-
dst_row.nb[3] = nb1;
|
|
2939
|
-
|
|
2940
|
-
//create weight for one row
|
|
2941
|
-
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
2942
|
-
for (int64_t id = 0; id < n_ids; id++) {
|
|
2943
|
-
// expert index
|
|
2944
|
-
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
2945
|
-
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
|
2946
|
-
|
|
2947
|
-
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
|
2948
|
-
int64_t i11 = (ne11 == 1 ? 0 : id);
|
|
2949
|
-
int64_t i12 = iid1;
|
|
2950
|
-
|
|
2951
|
-
int64_t i1 = id;
|
|
2952
|
-
int64_t i2 = i12;
|
|
2953
|
-
|
|
2954
|
-
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
|
2955
|
-
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
|
2956
|
-
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
|
2957
|
-
|
|
2958
|
-
src0_row.data = src0_tmp_ptr;
|
|
2959
|
-
src1_row.data = src1_tmp_ptr;
|
|
2960
|
-
dst_row.data = dst_tmp_ptr;
|
|
2961
|
-
dst_row.src[0] = &src0_row;
|
|
2962
|
-
dst_row.src[1] = &src1_row;
|
|
2963
|
-
|
|
2964
|
-
ggml_cann_mul_mat(ctx, &dst_row);
|
|
2965
|
-
}
|
|
2966
|
-
}
|
|
2967
|
-
return;
|
|
2968
|
-
#endif
|
|
2969
|
-
|
|
2970
|
-
std::vector<aclTensor*> src0_tensor_vec;
|
|
2971
|
-
std::vector<aclTensor*> src1_tensor_vec;
|
|
2972
|
-
std::vector<aclTensor*> dst_tensor_vec;
|
|
2973
|
-
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
2974
|
-
for (int64_t id = 0; id < n_ids; id++) {
|
|
2975
|
-
// src0_row [M, D] -> weight && permute
|
|
2976
|
-
int64_t src0_ne[2] = {ne01, ne00};
|
|
2977
|
-
size_t src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
|
|
2978
|
-
// src1_row [D, 1] -> input
|
|
2979
|
-
int64_t src1_ne[2] = {ne10, 1};
|
|
2980
|
-
size_t src1_nb[2] = {nb10, nb11};
|
|
2981
|
-
// dst_row [M, 1] -> out
|
|
2982
|
-
int64_t dst_ne[2] = {ne0, 1};
|
|
2983
|
-
size_t dst_nb[2] = {nb0, nb1};
|
|
2984
|
-
|
|
2985
|
-
// expert index
|
|
2986
|
-
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
2987
|
-
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
|
2988
|
-
|
|
2989
|
-
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
|
2990
|
-
int64_t i11 = (ne11 == 1 ? 0 : id);
|
|
2991
|
-
int64_t i12 = iid1;
|
|
2992
|
-
|
|
2993
|
-
int64_t i1 = id;
|
|
2994
|
-
int64_t i2 = i12;
|
|
2995
|
-
|
|
2996
|
-
void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
|
|
2997
|
-
void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
|
|
2998
|
-
void* dst_tmp_ptr = dst_original + i1*nb1 + i2*nb2;
|
|
2999
|
-
|
|
3000
|
-
aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
|
|
3001
|
-
ACL_FLOAT, sizeof(float),
|
|
3002
|
-
src0_ne, src0_nb, 2);
|
|
3003
|
-
aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
|
|
3004
|
-
ACL_FLOAT, sizeof(float),
|
|
3005
|
-
src1_ne, src1_nb, 2);
|
|
3006
|
-
aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
|
|
3007
|
-
ACL_FLOAT, sizeof(float),
|
|
3008
|
-
dst_ne, dst_nb, 2);
|
|
3009
|
-
|
|
3010
|
-
src0_tensor_vec.push_back(acl_src0);
|
|
3011
|
-
src1_tensor_vec.push_back(acl_src1);
|
|
3012
|
-
dst_tensor_vec.push_back(acl_dst);
|
|
3013
|
-
}
|
|
2874
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, BatchMatMul, active_tensor.get(), select_export_transpose.get(), acl_dst.get(), 2);
|
|
3014
2875
|
}
|
|
3015
|
-
|
|
3016
|
-
size_t GROUP_SIZE = 128;
|
|
3017
|
-
// GroupedMatmulV3 required tensor_list.size < 128
|
|
3018
|
-
for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
|
|
3019
|
-
// split and call GroupedMatmulV3
|
|
3020
|
-
size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
|
|
3021
|
-
std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
|
|
3022
|
-
std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
|
|
3023
|
-
std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
|
|
3024
|
-
|
|
3025
|
-
aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
|
|
3026
|
-
aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
|
|
3027
|
-
aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
|
|
3028
|
-
|
|
3029
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV3, src1_tensor_list, src0_tensor_list,
|
|
3030
|
-
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
|
|
3031
|
-
|
|
3032
|
-
ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
|
|
3033
|
-
}
|
|
3034
|
-
return;
|
|
3035
2876
|
}
|
|
3036
2877
|
|
|
3037
2878
|
/**
|
|
@@ -3057,7 +2898,7 @@ static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor*
|
|
|
3057
2898
|
* @note This function assumes quantized data types and is designed for
|
|
3058
2899
|
* MoE architectures with potential sparse expert routing.
|
|
3059
2900
|
*/
|
|
3060
|
-
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
2901
|
+
static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
3061
2902
|
// TODO: Use aclnnGroupedMatMul
|
|
3062
2903
|
//dst [M, K, N, 1]
|
|
3063
2904
|
ggml_tensor * src0 = dst->src[0]; //src0 [D, M, A, 1]
|
|
@@ -3067,24 +2908,24 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
|
|
|
3067
2908
|
GGML_TENSOR_BINARY_OP_LOCALS
|
|
3068
2909
|
|
|
3069
2910
|
// copy index from npu to cpu
|
|
3070
|
-
int64_t n_as
|
|
3071
|
-
int64_t n_ids = ids->ne[0];
|
|
2911
|
+
int64_t n_as = ne02; // A
|
|
2912
|
+
int64_t n_ids = ids->ne[0]; // K
|
|
3072
2913
|
|
|
3073
2914
|
std::vector<char> ids_host(ggml_nbytes(ids));
|
|
3074
|
-
|
|
3075
|
-
|
|
2915
|
+
ACL_CHECK(aclrtMemcpyAsync(ids_host.data(), ggml_nbytes(ids), ids->data, ggml_nbytes(ids),
|
|
2916
|
+
ACL_MEMCPY_DEVICE_TO_HOST, ctx.stream()));
|
|
3076
2917
|
ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
|
|
3077
2918
|
|
|
3078
2919
|
char * src0_original = (char *) src0->data;
|
|
3079
2920
|
char * src1_original = (char *) src1->data;
|
|
3080
|
-
char * dst_original = (char *)
|
|
2921
|
+
char * dst_original = (char *) dst->data;
|
|
3081
2922
|
|
|
3082
2923
|
ggml_tensor src0_row = *src0;
|
|
3083
2924
|
ggml_tensor src1_row = *src1;
|
|
3084
|
-
ggml_tensor dst_row
|
|
2925
|
+
ggml_tensor dst_row = *dst;
|
|
3085
2926
|
|
|
3086
2927
|
const enum ggml_type type = dst->src[0]->type;
|
|
3087
|
-
float
|
|
2928
|
+
float weight_elem_size;
|
|
3088
2929
|
if (type == GGML_TYPE_Q4_0) {
|
|
3089
2930
|
weight_elem_size = float(sizeof(uint8_t)) / 2;
|
|
3090
2931
|
} else if (type == GGML_TYPE_Q8_0) {
|
|
@@ -3094,18 +2935,18 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
|
|
|
3094
2935
|
}
|
|
3095
2936
|
|
|
3096
2937
|
// src0_row [D, M, 1, 1] weight without permute
|
|
3097
|
-
src0_row.ne[2]
|
|
3098
|
-
src0_row.ne[3]
|
|
3099
|
-
src0_row.nb[0]
|
|
3100
|
-
src0_row.nb[1]
|
|
3101
|
-
src0_row.nb[2]
|
|
3102
|
-
src0_row.nb[3]
|
|
2938
|
+
src0_row.ne[2] = 1;
|
|
2939
|
+
src0_row.ne[3] = 1;
|
|
2940
|
+
src0_row.nb[0] = weight_elem_size;
|
|
2941
|
+
src0_row.nb[1] = weight_elem_size * ne00;
|
|
2942
|
+
src0_row.nb[2] = weight_elem_size * ne00;
|
|
2943
|
+
src0_row.nb[3] = weight_elem_size * ne00;
|
|
3103
2944
|
size_t weight_stride = ne00 * ne01 * weight_elem_size;
|
|
3104
|
-
size_t weight_size
|
|
2945
|
+
size_t weight_size = weight_stride * ne02 * ne03;
|
|
3105
2946
|
|
|
3106
2947
|
// scale [D, M, 1, 1] -> scale && permute
|
|
3107
2948
|
size_t scale_elem_size = sizeof(uint16_t);
|
|
3108
|
-
size_t scale_stride
|
|
2949
|
+
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
|
|
3109
2950
|
|
|
3110
2951
|
// src1_row [D, 1, 1, 1] -> input
|
|
3111
2952
|
src1_row.ne[1] = 1;
|
|
@@ -3123,11 +2964,11 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
|
|
|
3123
2964
|
|
|
3124
2965
|
//create weight for one row
|
|
3125
2966
|
ggml_cann_pool_alloc weight_allocator(ctx.pool());
|
|
3126
|
-
void*
|
|
2967
|
+
void * weight_buffer = weight_allocator.alloc(nb02);
|
|
3127
2968
|
for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
|
|
3128
2969
|
for (int64_t id = 0; id < n_ids; id++) {
|
|
3129
2970
|
// expert index
|
|
3130
|
-
int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
|
|
2971
|
+
int32_t i02 = *(int32_t *) (ids_host.data() + iid1 * ids->nb[1] + id * ids->nb[0]);
|
|
3131
2972
|
GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
|
3132
2973
|
|
|
3133
2974
|
// If B = 1 (broadcast), always use 0; otherwise, use id.
|
|
@@ -3137,21 +2978,21 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
|
|
|
3137
2978
|
int64_t i1 = id;
|
|
3138
2979
|
int64_t i2 = i12;
|
|
3139
2980
|
|
|
3140
|
-
void* src0_tmp_ptr
|
|
3141
|
-
void* scale_tmp_ptr = src0_original + weight_size + i02*scale_stride;
|
|
3142
|
-
void* src1_tmp_ptr
|
|
3143
|
-
void* dst_tmp_ptr
|
|
2981
|
+
void * src0_tmp_ptr = src0_original + i02 * weight_stride;
|
|
2982
|
+
void * scale_tmp_ptr = src0_original + weight_size + i02 * scale_stride;
|
|
2983
|
+
void * src1_tmp_ptr = src1_original + i11 * nb11 + i12 * nb12;
|
|
2984
|
+
void * dst_tmp_ptr = dst_original + i1 * nb1 + i2 * nb2;
|
|
3144
2985
|
|
|
3145
2986
|
// mem cpy
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
void* scale_buffer = (char*)weight_buffer + weight_stride;
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
src0_row.data
|
|
3153
|
-
src1_row.data
|
|
3154
|
-
dst_row.data
|
|
2987
|
+
ACL_CHECK(aclrtMemcpyAsync(weight_buffer, weight_stride, src0_tmp_ptr, weight_stride,
|
|
2988
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
|
2989
|
+
void * scale_buffer = (char *) weight_buffer + weight_stride;
|
|
2990
|
+
ACL_CHECK(aclrtMemcpyAsync(scale_buffer, scale_stride, scale_tmp_ptr, scale_stride,
|
|
2991
|
+
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
|
2992
|
+
|
|
2993
|
+
src0_row.data = weight_buffer;
|
|
2994
|
+
src1_row.data = src1_tmp_ptr;
|
|
2995
|
+
dst_row.data = dst_tmp_ptr;
|
|
3155
2996
|
dst_row.src[0] = &src0_row;
|
|
3156
2997
|
dst_row.src[1] = &src1_row;
|
|
3157
2998
|
|
|
@@ -3161,7 +3002,7 @@ static void ggml_cann_mul_mat_id_quant(ggml_backend_cann_context& ctx, ggml_tens
|
|
|
3161
3002
|
return;
|
|
3162
3003
|
}
|
|
3163
3004
|
|
|
3164
|
-
void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
3005
|
+
void ggml_cann_mul_mat_id(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
3165
3006
|
const enum ggml_type type = dst->src[0]->type;
|
|
3166
3007
|
switch (type) {
|
|
3167
3008
|
case GGML_TYPE_F32:
|
|
@@ -3178,12 +3019,11 @@ void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
|
|
3178
3019
|
}
|
|
3179
3020
|
}
|
|
3180
3021
|
|
|
3181
|
-
void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
3182
|
-
|
|
3183
|
-
ggml_tensor*
|
|
3184
|
-
ggml_tensor*
|
|
3185
|
-
ggml_tensor*
|
|
3186
|
-
ggml_tensor* src3 = dst->src[3]; // mask, fp16
|
|
3022
|
+
void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
|
3023
|
+
ggml_tensor * src0 = dst->src[0]; // q, fp32 | B, N, S, D (uncont) -> B, S, N, D (cont)
|
|
3024
|
+
ggml_tensor * src1 = dst->src[1]; // k, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
|
|
3025
|
+
ggml_tensor * src2 = dst->src[2]; // v, fp16 | B, N, S, D (uncont) -> B, S, N, D (cont)
|
|
3026
|
+
ggml_tensor * src3 = dst->src[3]; // mask, fp16
|
|
3187
3027
|
|
|
3188
3028
|
// B, N, S, D (uncont) -> B, S, N, D (cont)
|
|
3189
3029
|
int64_t src0_bsnd_ne[GGML_MAX_DIMS];
|
|
@@ -3199,229 +3039,200 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
|
|
3199
3039
|
size_t src2_bsnd_nb[GGML_MAX_DIMS];
|
|
3200
3040
|
memcpy(src2_bsnd_nb, src2->nb, GGML_MAX_DIMS * sizeof(size_t));
|
|
3201
3041
|
|
|
3202
|
-
auto transpose12 = [](int64_t* ne, size_t* nb) {
|
|
3042
|
+
auto transpose12 = [](int64_t * ne, size_t * nb) {
|
|
3203
3043
|
int64_t ne_tmp = ne[1];
|
|
3204
3044
|
size_t nb_tmp = nb[1];
|
|
3205
|
-
ne[1]
|
|
3206
|
-
nb[1]
|
|
3207
|
-
ne[2]
|
|
3208
|
-
nb[2]
|
|
3045
|
+
ne[1] = ne[2];
|
|
3046
|
+
nb[1] = nb[2];
|
|
3047
|
+
ne[2] = ne_tmp;
|
|
3048
|
+
nb[2] = nb_tmp;
|
|
3209
3049
|
};
|
|
3210
3050
|
|
|
3211
3051
|
transpose12(src0_bsnd_ne, src0_bsnd_nb);
|
|
3212
3052
|
transpose12(src1_bsnd_ne, src1_bsnd_nb);
|
|
3213
3053
|
transpose12(src2_bsnd_ne, src2_bsnd_nb);
|
|
3214
3054
|
|
|
3215
|
-
float maxBias
|
|
3216
|
-
float scaleValue
|
|
3055
|
+
float maxBias = 0.0f;
|
|
3056
|
+
float scaleValue = 1.0f;
|
|
3217
3057
|
float logitSoftcap = 0.0f;
|
|
3218
|
-
memcpy(&scaleValue,
|
|
3219
|
-
memcpy(&maxBias,
|
|
3220
|
-
memcpy(&logitSoftcap,
|
|
3058
|
+
memcpy(&scaleValue, (float *) dst->op_params + 0, sizeof(float));
|
|
3059
|
+
memcpy(&maxBias, (float *) dst->op_params + 1, sizeof(float));
|
|
3060
|
+
memcpy(&logitSoftcap, (float *) dst->op_params + 2, sizeof(float));
|
|
3221
3061
|
|
|
3222
|
-
if(logitSoftcap == 0.0f){
|
|
3062
|
+
if (logitSoftcap == 0.0f) {
|
|
3223
3063
|
size_t faElemSize = sizeof(uint16_t);
|
|
3224
|
-
auto faDataType = ACL_FLOAT16;
|
|
3064
|
+
auto faDataType = ACL_FLOAT16; //ACL_BF16;
|
|
3225
3065
|
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
3229
|
-
aclTensor* acl_dst_f16_tensor = nullptr;
|
|
3066
|
+
acl_tensor_ptr acl_q_tensor = nullptr;
|
|
3067
|
+
acl_tensor_ptr acl_k_tensor = nullptr;
|
|
3068
|
+
acl_tensor_ptr acl_v_tensor = nullptr;
|
|
3230
3069
|
|
|
3231
3070
|
// Step 1: cast the src0 (Query) to fp16 if needed
|
|
3232
3071
|
ggml_cann_pool_alloc src0_f16_allocator(ctx.pool());
|
|
3233
|
-
void*
|
|
3072
|
+
void * src0_f16_buffer = nullptr;
|
|
3234
3073
|
|
|
3235
|
-
if(ggml_cann_type_mapping(src0->type) != faDataType){
|
|
3236
|
-
|
|
3237
|
-
src0_bsnd_nb, GGML_MAX_DIMS);
|
|
3238
|
-
src0_f16_buffer = src0_f16_allocator.alloc(
|
|
3239
|
-
ggml_nelements(src0) * faElemSize);
|
|
3074
|
+
if (ggml_cann_type_mapping(src0->type) != faDataType) {
|
|
3075
|
+
acl_tensor_ptr acl_src0_f32_tensor =
|
|
3076
|
+
ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
|
|
3077
|
+
src0_f16_buffer = src0_f16_allocator.alloc(ggml_nelements(src0) * faElemSize);
|
|
3240
3078
|
|
|
3241
|
-
int64_t* src0_f16_ne = src0_bsnd_ne;
|
|
3242
|
-
size_t
|
|
3079
|
+
int64_t * src0_f16_ne = src0_bsnd_ne;
|
|
3080
|
+
size_t src0_f16_nb[GGML_MAX_DIMS];
|
|
3243
3081
|
src0_f16_nb[0] = sizeof(uint16_t);
|
|
3244
|
-
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
|
3082
|
+
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
|
3245
3083
|
src0_f16_nb[i] = src0_f16_nb[i - 1] * src0_f16_ne[i - 1];
|
|
3246
3084
|
}
|
|
3247
3085
|
|
|
3248
|
-
|
|
3249
|
-
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
ggml_cann_release_resources(ctx, acl_src0_f32_tensor);
|
|
3254
|
-
}else{
|
|
3255
|
-
acl_src0_f16_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne,
|
|
3256
|
-
src0_bsnd_nb, GGML_MAX_DIMS);
|
|
3086
|
+
acl_q_tensor = ggml_cann_create_tensor(src0_f16_buffer, faDataType, faElemSize, src0_f16_ne, src0_f16_nb,
|
|
3087
|
+
GGML_MAX_DIMS);
|
|
3088
|
+
aclnn_cast(ctx, acl_src0_f32_tensor.get(), acl_q_tensor.get(), faDataType);
|
|
3089
|
+
} else {
|
|
3090
|
+
acl_q_tensor = ggml_cann_create_tensor(src0, src0_bsnd_ne, src0_bsnd_nb, GGML_MAX_DIMS);
|
|
3257
3091
|
}
|
|
3258
3092
|
|
|
3259
3093
|
// Step 2: create the acl tensors for src1 (Key), src2 (Value),
|
|
3260
3094
|
// and the direct output from FusedInferAttention
|
|
3261
3095
|
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
acl_src2_f16_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne,
|
|
3265
|
-
src2_bsnd_nb, GGML_MAX_DIMS);
|
|
3266
|
-
|
|
3267
|
-
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
|
3268
|
-
void* out_f16_buffer = out_f16_allocator.alloc(
|
|
3269
|
-
ggml_nelements(dst) * faElemSize);
|
|
3270
|
-
|
|
3271
|
-
int64_t* out_f16_ne = src0_bsnd_ne;
|
|
3272
|
-
size_t out_f16_nb[GGML_MAX_DIMS];
|
|
3273
|
-
out_f16_nb[0] = faElemSize;
|
|
3274
|
-
for(int i = 1; i < GGML_MAX_DIMS; ++i){
|
|
3275
|
-
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
|
3276
|
-
}
|
|
3277
|
-
|
|
3278
|
-
acl_dst_f16_tensor = ggml_cann_create_tensor(
|
|
3279
|
-
out_f16_buffer, faDataType, faElemSize,
|
|
3280
|
-
out_f16_ne, out_f16_nb, GGML_MAX_DIMS
|
|
3281
|
-
);
|
|
3096
|
+
acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
|
|
3097
|
+
acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
|
|
3282
3098
|
|
|
3283
3099
|
// Step 3: create the PSEShift tensor if needed
|
|
3284
3100
|
// this tensor is considered as mask (f16) in the llama.cpp
|
|
3285
|
-
|
|
3101
|
+
acl_tensor_ptr bcast_pse_tensor;
|
|
3286
3102
|
ggml_cann_pool_alloc bcast_pse_allocator(ctx.pool());
|
|
3287
|
-
if(src3 != nullptr){
|
|
3103
|
+
if (src3 != nullptr) {
|
|
3288
3104
|
// Construct the truncated pse tensor (common for prefill/decode)
|
|
3289
3105
|
int64_t trunc_pse_ne[GGML_MAX_DIMS] = {
|
|
3290
|
-
src3->ne[0],
|
|
3291
|
-
src0->ne[1],
|
|
3292
|
-
src3->ne[2],
|
|
3293
|
-
src3->ne[3]
|
|
3106
|
+
src3->ne[0], // D
|
|
3107
|
+
src0->ne[1], // S (number of Q tokens)
|
|
3108
|
+
src3->ne[2], // mask N
|
|
3109
|
+
src3->ne[3] // B
|
|
3294
3110
|
};
|
|
3295
|
-
size_t* trunc_pse_nb = src3->nb;
|
|
3111
|
+
size_t * trunc_pse_nb = src3->nb;
|
|
3296
3112
|
|
|
3297
|
-
|
|
3298
|
-
src3->data, ACL_FLOAT16, sizeof(uint16_t),
|
|
3299
|
-
trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS
|
|
3300
|
-
);
|
|
3113
|
+
acl_tensor_ptr acl_mask_f16_trunc_tensor = ggml_cann_create_tensor(
|
|
3114
|
+
src3->data, ACL_FLOAT16, sizeof(uint16_t), trunc_pse_ne, trunc_pse_nb, GGML_MAX_DIMS);
|
|
3301
3115
|
|
|
3302
3116
|
int64_t bcast_pse_ne[GGML_MAX_DIMS];
|
|
3303
|
-
size_t
|
|
3304
|
-
bcast_pse_ne[0] = src3->ne[0];
|
|
3305
|
-
bcast_pse_ne[1] = src0->ne[1];
|
|
3306
|
-
bcast_pse_ne[2] = src0->ne[2];
|
|
3307
|
-
bcast_pse_ne[3] = src3->ne[3];
|
|
3117
|
+
size_t bcast_pse_nb[GGML_MAX_DIMS];
|
|
3118
|
+
bcast_pse_ne[0] = src3->ne[0]; // D
|
|
3119
|
+
bcast_pse_ne[1] = src0->ne[1]; // S
|
|
3120
|
+
bcast_pse_ne[2] = src0->ne[2]; // N (num_heads)
|
|
3121
|
+
bcast_pse_ne[3] = src3->ne[3]; // B
|
|
3308
3122
|
if (maxBias == 0.0f) {
|
|
3309
3123
|
// When maxBias == 0.0f, use nb = 0 reduce once repeat (Qwen2)
|
|
3310
3124
|
// Construct the bcast tensor (simulate repeat on the head dimension using stride=0)
|
|
3311
3125
|
bcast_pse_nb[0] = sizeof(uint16_t);
|
|
3312
3126
|
bcast_pse_nb[1] = bcast_pse_nb[0] * bcast_pse_ne[0];
|
|
3313
|
-
bcast_pse_nb[2] = 0;
|
|
3127
|
+
bcast_pse_nb[2] = 0; // <---- the head dimension shares the same data
|
|
3314
3128
|
bcast_pse_nb[3] = src3->nb[3];
|
|
3315
3129
|
|
|
3316
|
-
bcast_pse_tensor = ggml_cann_create_tensor(
|
|
3317
|
-
|
|
3318
|
-
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
|
|
3319
|
-
);
|
|
3130
|
+
bcast_pse_tensor = ggml_cann_create_tensor(src3->data, ACL_FLOAT16, sizeof(uint16_t), bcast_pse_ne,
|
|
3131
|
+
bcast_pse_nb, GGML_MAX_DIMS);
|
|
3320
3132
|
|
|
3321
|
-
ggml_cann_release_resources(ctx, acl_mask_f16_trunc_tensor);
|
|
3322
3133
|
} else {
|
|
3323
3134
|
bcast_pse_nb[0] = sizeof(uint16_t);
|
|
3324
3135
|
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
3325
3136
|
bcast_pse_nb[i] = bcast_pse_nb[i - 1] * bcast_pse_ne[i - 1];
|
|
3326
3137
|
}
|
|
3327
3138
|
|
|
3328
|
-
void* bcast_pse_buffer =
|
|
3329
|
-
ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t)
|
|
3330
|
-
);
|
|
3139
|
+
void * bcast_pse_buffer =
|
|
3140
|
+
bcast_pse_allocator.alloc(ggml_nelements(src3) * src0->ne[2] * sizeof(uint16_t));
|
|
3331
3141
|
|
|
3332
|
-
bcast_pse_tensor = ggml_cann_create_tensor(
|
|
3333
|
-
|
|
3334
|
-
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS
|
|
3335
|
-
);
|
|
3142
|
+
bcast_pse_tensor = ggml_cann_create_tensor(bcast_pse_buffer, ACL_FLOAT16, sizeof(uint16_t),
|
|
3143
|
+
bcast_pse_ne, bcast_pse_nb, GGML_MAX_DIMS);
|
|
3336
3144
|
|
|
3337
|
-
int64_t repeats[] = {1, src0->ne[2], 1, 1};
|
|
3338
|
-
aclnn_repeat(ctx, acl_mask_f16_trunc_tensor, bcast_pse_tensor, repeats);
|
|
3145
|
+
int64_t repeats[] = { 1, src0->ne[2], 1, 1 };
|
|
3146
|
+
aclnn_repeat(ctx, acl_mask_f16_trunc_tensor.get(), bcast_pse_tensor.get(), repeats);
|
|
3339
3147
|
|
|
3340
3148
|
// alibi
|
|
3341
3149
|
// Compute the slope if needed. Derived from ggml_cann_softmax().
|
|
3342
|
-
const int64_t
|
|
3150
|
+
const int64_t n_heads = src0->ne[2];
|
|
3343
3151
|
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
|
|
3344
|
-
void*
|
|
3345
|
-
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
|
|
3152
|
+
void * slope_buffer = slope_allocator.get();
|
|
3153
|
+
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
|
|
3346
3154
|
|
|
3347
|
-
int64_t slope_ne[] = {1, 1, n_heads, 1};
|
|
3348
|
-
size_t
|
|
3155
|
+
int64_t slope_ne[] = { 1, 1, n_heads, 1 };
|
|
3156
|
+
size_t slope_nb[GGML_MAX_DIMS];
|
|
3349
3157
|
slope_nb[0] = sizeof(uint16_t);
|
|
3350
|
-
for(int i = 1;i<GGML_MAX_DIMS;i++) {
|
|
3351
|
-
slope_nb[i] = slope_nb[i-1] * slope_ne[0];
|
|
3158
|
+
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
|
3159
|
+
slope_nb[i] = slope_nb[i - 1] * slope_ne[0];
|
|
3352
3160
|
}
|
|
3353
3161
|
|
|
3354
|
-
|
|
3355
|
-
|
|
3356
|
-
|
|
3357
|
-
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
|
|
3358
|
-
|
|
3359
|
-
ggml_cann_release_resources(ctx, slope_tensor, acl_mask_f16_trunc_tensor);
|
|
3162
|
+
acl_tensor_ptr slope_tensor = ggml_cann_create_tensor(slope_buffer, ACL_FLOAT16, sizeof(uint16_t),
|
|
3163
|
+
slope_ne, slope_nb, GGML_MAX_DIMS);
|
|
3164
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor.get(), slope_tensor.get());
|
|
3360
3165
|
}
|
|
3361
3166
|
}
|
|
3362
3167
|
|
|
3363
3168
|
// Step 4: set the inputs for FusedInferAttention.
|
|
3364
|
-
|
|
3365
|
-
|
|
3366
|
-
|
|
3367
|
-
|
|
3368
|
-
|
|
3369
|
-
auto acl_v_tensor_list = aclCreateTensorList(acl_v_tensors, kvTensorNum);
|
|
3370
|
-
|
|
3371
|
-
int64_t numHeads = src0->ne[2]; // N
|
|
3372
|
-
int64_t numKeyValueHeads = src1->ne[2];
|
|
3169
|
+
acl_tensor_list_ptr acl_k_tensor_list = ggml_cann_create_tensor_list(acl_k_tensor);
|
|
3170
|
+
acl_tensor_list_ptr acl_v_tensor_list = ggml_cann_create_tensor_list(acl_v_tensor);
|
|
3171
|
+
|
|
3172
|
+
int64_t numHeads = src0->ne[2]; // N
|
|
3173
|
+
int64_t numKeyValueHeads = src1->ne[2];
|
|
3373
3174
|
// double scaleValue = 1 / sqrt(src0->ne[0]); // 1/sqrt(d)
|
|
3374
|
-
int64_t preTokens
|
|
3375
|
-
int64_t nextTokens
|
|
3376
|
-
char
|
|
3377
|
-
int64_t sparseMode
|
|
3378
|
-
int64_t innerPrecise
|
|
3379
|
-
int64_t blockSize
|
|
3380
|
-
int64_t antiquantMode
|
|
3381
|
-
bool
|
|
3382
|
-
int64_t keyAntiquantMode
|
|
3175
|
+
int64_t preTokens = 65535;
|
|
3176
|
+
int64_t nextTokens = 65535;
|
|
3177
|
+
char layout[5] = { 'B', 'S', 'N', 'D', 0 };
|
|
3178
|
+
int64_t sparseMode = 0;
|
|
3179
|
+
int64_t innerPrecise = (src0->ne[1] == 1) ? 0 : 2;
|
|
3180
|
+
int64_t blockSize = 0;
|
|
3181
|
+
int64_t antiquantMode = 0;
|
|
3182
|
+
bool softmaxLseFlag = false;
|
|
3183
|
+
int64_t keyAntiquantMode = 0;
|
|
3383
3184
|
int64_t valueAntiquantMode = 0;
|
|
3384
3185
|
|
|
3385
|
-
|
|
3386
|
-
|
|
3387
|
-
|
|
3388
|
-
|
|
3389
|
-
|
|
3390
|
-
|
|
3391
|
-
|
|
3392
|
-
|
|
3393
|
-
|
|
3394
|
-
|
|
3395
|
-
|
|
3396
|
-
|
|
3397
|
-
|
|
3398
|
-
|
|
3399
|
-
|
|
3400
|
-
|
|
3401
|
-
|
|
3402
|
-
|
|
3403
|
-
|
|
3404
|
-
|
|
3405
|
-
|
|
3406
|
-
|
|
3407
|
-
|
|
3408
|
-
|
|
3409
|
-
|
|
3186
|
+
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
|
3187
|
+
acl_tensor_ptr fa_dst_tensor;
|
|
3188
|
+
acl_tensor_ptr acl_dst_tensor;
|
|
3189
|
+
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
|
3190
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
3191
|
+
void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
|
|
3192
|
+
|
|
3193
|
+
int64_t * out_f16_ne = src0_bsnd_ne;
|
|
3194
|
+
size_t out_f16_nb[GGML_MAX_DIMS];
|
|
3195
|
+
out_f16_nb[0] = faElemSize;
|
|
3196
|
+
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
|
3197
|
+
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
|
3198
|
+
}
|
|
3199
|
+
|
|
3200
|
+
fa_dst_tensor =
|
|
3201
|
+
ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
|
|
3202
|
+
} else {
|
|
3203
|
+
fa_dst_tensor = ggml_cann_create_tensor(dst);
|
|
3204
|
+
}
|
|
3205
|
+
|
|
3206
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, FusedInferAttentionScoreV2, acl_q_tensor.get(), acl_k_tensor_list.get(),
|
|
3207
|
+
acl_v_tensor_list.get(), // q, k, v
|
|
3208
|
+
bcast_pse_tensor.get(), nullptr, // pse, mask
|
|
3209
|
+
nullptr, nullptr, // actSeqLen, actSeqLenkv
|
|
3210
|
+
nullptr, nullptr, // deqScale1, quantScale1
|
|
3211
|
+
nullptr, nullptr, nullptr, // deqScale2, quantScale2, quantOffset2
|
|
3212
|
+
nullptr, nullptr, // antiquantScale, antiquantOffset
|
|
3213
|
+
nullptr, // blockTable
|
|
3214
|
+
nullptr, nullptr, // qPadSize, kvPadSize
|
|
3215
|
+
nullptr, nullptr, // kAntiquantScale, kAntiQuantOffset
|
|
3216
|
+
nullptr, nullptr, // vAntiquantScale, vAntiQuantOffset
|
|
3217
|
+
nullptr, nullptr, nullptr, // kSharedPrefix, vSharedPrefix, actSharedLen
|
|
3218
|
+
numHeads, scaleValue, // heads, scaleValue
|
|
3219
|
+
preTokens, nextTokens, // preTokens, nextTokens
|
|
3220
|
+
layout, // inputLayout
|
|
3221
|
+
numKeyValueHeads, // numKVHeads
|
|
3222
|
+
sparseMode, innerPrecise, // sparseMode, innerPrecise
|
|
3223
|
+
blockSize, antiquantMode, // blockSize, antiquantMode
|
|
3224
|
+
softmaxLseFlag, // softmaxLseFlag
|
|
3225
|
+
keyAntiquantMode, valueAntiquantMode, // keyAntiqMode, valueAntiqMode
|
|
3226
|
+
fa_dst_tensor.get(), // attentionOut
|
|
3227
|
+
nullptr // softmaxLse
|
|
3410
3228
|
);
|
|
3411
3229
|
|
|
3412
|
-
|
|
3413
|
-
|
|
3414
|
-
|
|
3415
|
-
|
|
3416
|
-
ggml_cann_release_resources(ctx, acl_src0_f16_tensor,
|
|
3417
|
-
acl_src1_f16_tensor,
|
|
3418
|
-
acl_src2_f16_tensor,
|
|
3419
|
-
acl_dst_f16_tensor,
|
|
3420
|
-
acl_dst_tensor);
|
|
3421
|
-
if(src3 != nullptr){
|
|
3422
|
-
ggml_cann_release_resources(ctx, bcast_pse_tensor);
|
|
3230
|
+
if (dst->type == GGML_TYPE_F32) {
|
|
3231
|
+
// Step 6: post-processing, permute and cast to f32
|
|
3232
|
+
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
|
3233
|
+
aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
|
3423
3234
|
}
|
|
3424
|
-
}else{
|
|
3235
|
+
} else {
|
|
3425
3236
|
GGML_ABORT("Function is not implemented.");
|
|
3426
3237
|
}
|
|
3427
3238
|
}
|