@novastera-oss/llamarn 0.4.1 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/RNLlamaCpp.podspec +3 -0
- package/android/CMakeLists.txt +2 -0
- package/android/src/main/cpp/include/llama.h +44 -21
- package/android/src/main/jniLibs/arm64-v8a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libggml.so +0 -0
- package/android/src/main/jniLibs/arm64-v8a/libllama.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-base.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libggml.so +0 -0
- package/android/src/main/jniLibs/armeabi-v7a/libllama.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86/libggml.so +0 -0
- package/android/src/main/jniLibs/x86/libllama.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-base.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml-cpu.so +0 -0
- package/android/src/main/jniLibs/x86_64/libggml.so +0 -0
- package/android/src/main/jniLibs/x86_64/libllama.so +0 -0
- package/cpp/build-info.cpp +2 -2
- package/cpp/llama.cpp/CMakeLists.txt +12 -0
- package/cpp/llama.cpp/CODEOWNERS +116 -10
- package/cpp/llama.cpp/CONTRIBUTING.md +30 -3
- package/cpp/llama.cpp/README.md +13 -5
- package/cpp/llama.cpp/build-xcframework.sh +5 -0
- package/cpp/llama.cpp/cmake/riscv64-spacemit-linux-gnu-gcc.cmake +29 -0
- package/cpp/llama.cpp/common/CMakeLists.txt +12 -2
- package/cpp/llama.cpp/common/arg.cpp +303 -795
- package/cpp/llama.cpp/common/arg.h +2 -3
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.cpp +861 -0
- package/cpp/llama.cpp/common/chat-parser-xml-toolcall.h +45 -0
- package/cpp/llama.cpp/common/chat-parser.cpp +156 -15
- package/cpp/llama.cpp/common/chat-parser.h +13 -0
- package/cpp/llama.cpp/common/chat.cpp +1147 -88
- package/cpp/llama.cpp/common/chat.h +16 -3
- package/cpp/llama.cpp/common/common.cpp +70 -15
- package/cpp/llama.cpp/common/common.h +57 -19
- package/cpp/llama.cpp/common/download.cpp +1072 -0
- package/cpp/llama.cpp/common/download.h +55 -0
- package/cpp/llama.cpp/common/http.h +73 -0
- package/cpp/llama.cpp/common/json-partial.cpp +70 -2
- package/cpp/llama.cpp/common/json-schema-to-grammar.cpp +61 -22
- package/cpp/llama.cpp/common/json-schema-to-grammar.h +2 -0
- package/cpp/llama.cpp/common/log.cpp +59 -2
- package/cpp/llama.cpp/common/log.h +12 -4
- package/cpp/llama.cpp/common/sampling.cpp +84 -8
- package/cpp/llama.cpp/common/sampling.h +3 -1
- package/cpp/llama.cpp/common/speculative.cpp +1 -1
- package/cpp/llama.cpp/convert_hf_to_gguf.py +1608 -233
- package/cpp/llama.cpp/convert_hf_to_gguf_update.py +6 -1
- package/cpp/llama.cpp/convert_lora_to_gguf.py +37 -5
- package/cpp/llama.cpp/ggml/CMakeLists.txt +47 -28
- package/cpp/llama.cpp/ggml/include/ggml-backend.h +19 -1
- package/cpp/llama.cpp/ggml/include/ggml-cpu.h +1 -1
- package/cpp/llama.cpp/ggml/include/ggml-hexagon.h +19 -0
- package/cpp/llama.cpp/ggml/include/ggml-metal.h +1 -6
- package/cpp/llama.cpp/ggml/include/ggml-rpc.h +7 -9
- package/cpp/llama.cpp/ggml/include/ggml-zdnn.h +2 -1
- package/cpp/llama.cpp/ggml/include/ggml.h +199 -6
- package/cpp/llama.cpp/ggml/src/CMakeLists.txt +38 -0
- package/cpp/llama.cpp/ggml/src/ggml-alloc.c +299 -130
- package/cpp/llama.cpp/ggml/src/ggml-backend-impl.h +4 -4
- package/cpp/llama.cpp/ggml/src/ggml-backend-reg.cpp +21 -5
- package/cpp/llama.cpp/ggml/src/ggml-backend.cpp +99 -2
- package/cpp/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-blas/ggml-blas.cpp +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.cpp +57 -45
- package/cpp/llama.cpp/ggml/src/ggml-cann/acl_tensor.h +138 -47
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +1584 -1773
- package/cpp/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +201 -317
- package/cpp/llama.cpp/ggml/src/ggml-cann/common.h +146 -187
- package/cpp/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +771 -713
- package/cpp/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +135 -77
- package/cpp/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +5 -2
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/arm/quants.c +428 -26
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/loongarch/quants.c +16 -17
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/riscv/quants.c +318 -145
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/cpu-feats.cpp +50 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/s390/quants.c +155 -60
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch/x86/repack.cpp +8 -8
- package/cpp/llama.cpp/ggml/src/ggml-cpu/arch-fallback.h +0 -1
- package/cpp/llama.cpp/ggml/src/ggml-cpu/common.h +14 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-impl.h +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +108 -64
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +14 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.cpp +530 -87
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kernels.h +37 -45
- package/cpp/llama.cpp/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +349 -127
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.cpp +947 -1218
- package/cpp/llama.cpp/ggml/src/ggml-cpu/ops.h +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-cpu/repack.cpp +143 -29
- package/cpp/llama.cpp/ggml/src/ggml-cpu/simd-mappings.h +82 -76
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.cpp +1025 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime.h +13 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime1_kernels.cpp +3196 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/spacemit/ime_kernels.h +26 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.cpp +151 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/unary-ops.h +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.cpp +233 -28
- package/cpp/llama.cpp/ggml/src/ggml-cpu/vec.h +326 -66
- package/cpp/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +12 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/argsort.cu +102 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/binbcast.cu +110 -76
- package/cpp/llama.cpp/ggml/src/ggml-cuda/common.cuh +167 -38
- package/cpp/llama.cpp/ggml/src/ggml-cuda/conv2d.cu +6 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/convert.cuh +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy-utils.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cu +245 -151
- package/cpp/llama.cpp/ggml/src/ggml-cuda/cpy.cuh +1 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-common.cuh +341 -289
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cu +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile.cuh +1233 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec.cuh +586 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cu +6 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-wmma-f16.cuh +48 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn.cu +123 -220
- package/cpp/llama.cpp/ggml/src/ggml-cuda/getrows.cu +41 -39
- package/cpp/llama.cpp/ggml/src/ggml-cuda/ggml-cuda.cu +715 -45
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cu +150 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/im2col.cuh +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mma.cuh +321 -24
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cu +93 -351
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmf.cuh +828 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cu +164 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmid.cuh +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cu +3 -166
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cu +371 -78
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvf.cuh +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cu +279 -147
- package/cpp/llama.cpp/ggml/src/ggml-cuda/mmvq.cuh +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-cuda/norm.cu +97 -85
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad.cu +46 -23
- package/cpp/llama.cpp/ggml/src/ggml-cuda/pad_reflect_1d.cu +63 -54
- package/cpp/llama.cpp/ggml/src/ggml-cuda/quantize.cu +12 -10
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cu +192 -77
- package/cpp/llama.cpp/ggml/src/ggml-cuda/rope.cuh +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/scale.cu +10 -9
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set-rows.cu +137 -75
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cu +39 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/set.cuh +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq112-dv112.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq128-dv128.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq256-dv256.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq40-dv40.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq576-dv512.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq64-dv64.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq72-dv72.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq80-dv80.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-tile-instance-dkq96-dv96.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-f16-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q4_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q5_1-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-f16.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q4_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q5_1.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-instance-q8_0-q8_0.cu +7 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +40 -19
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_1.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_10.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_11.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_12.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_13.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_14.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_15.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_16.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_2.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_3.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_4.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_5.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_6.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_7.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_8.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/mmf-instance-ncols_9.cu +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cu +336 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/topk-moe.cuh +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/tsembd.cu +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cu +105 -11
- package/cpp/llama.cpp/ggml/src/ggml-cuda/unary.cuh +36 -0
- package/cpp/llama.cpp/ggml/src/ggml-cuda/upscale.cu +87 -6
- package/cpp/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +28 -12
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/CMakeLists.txt +68 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/ggml-hexagon.cpp +3807 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/CMakeLists.txt +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/act-ops.c +442 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/binary-ops.c +360 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +157 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ctx.h +40 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.c +69 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-dma.h +119 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-msg.h +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp-ops.h +64 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/htp_iface.idl +16 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-exp.c +93 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-inverse.c +60 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-sigmoid.c +49 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.c +960 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/hvx-utils.h +1032 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/main.c +829 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/matmul-ops.c +2223 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/ops-utils.h +149 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/rope-ops.c +418 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/softmax-ops.c +402 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/unary-ops.c +255 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.c +297 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp/worker-pool.h +57 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.c +448 -0
- package/cpp/llama.cpp/ggml/src/ggml-hexagon/htp-utils.h +220 -0
- package/cpp/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +8 -13
- package/cpp/llama.cpp/ggml/src/ggml-impl.h +110 -12
- package/cpp/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.cpp +446 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-common.h +52 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.h +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-context.m +599 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp +1662 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h +251 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.m +1527 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +244 -39
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp +3844 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.h +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.cpp +723 -0
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal +3453 -1907
- package/cpp/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +3 -1
- package/cpp/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +10 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +1331 -109
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/cvt.cl +126 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32.cl +35 -7
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/flash_attn_f32_f16.cl +31 -4
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemm_moe_mxfp4_f32.cl +162 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/gemv_moe_mxfp4_f32.cl +156 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/get_rows.cl +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_kq_kqv.cl +273 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f16_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_f32_f32_l4_lm.cl +24 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mm_q8_0_f32_l4_lm.cl +154 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_mxfp4_f32_flat.cl +176 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32_flat.cl +222 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_mxfp4_f32_flat.cl +167 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32.cl +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/mul_mv_q8_0_f32_flat.cl +202 -0
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/pad.cl +29 -20
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rms_norm.cl +25 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/rope.cl +50 -24
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/set_rows.cl +123 -10
- package/cpp/llama.cpp/ggml/src/ggml-opencl/kernels/tsembd.cl +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-quants.c +1 -0
- package/cpp/llama.cpp/ggml/src/ggml-rpc/ggml-rpc.cpp +341 -161
- package/cpp/llama.cpp/ggml/src/ggml-sycl/backend.hpp +6 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/binbcast.cpp +6 -5
- package/cpp/llama.cpp/ggml/src/ggml-sycl/common.hpp +74 -15
- package/cpp/llama.cpp/ggml/src/ggml-sycl/concat.cpp +50 -30
- package/cpp/llama.cpp/ggml/src/ggml-sycl/conv.cpp +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-sycl/convert.cpp +166 -99
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/count-equal.hpp +9 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/cpy.cpp +72 -94
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +67 -49
- package/cpp/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +21 -31
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +252 -316
- package/cpp/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/getrows.cpp +9 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +359 -142
- package/cpp/llama.cpp/ggml/src/ggml-sycl/gla.cpp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +80 -60
- package/cpp/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +201 -132
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.cpp +230 -55
- package/cpp/llama.cpp/ggml/src/ggml-sycl/norm.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.cpp +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad.hpp +24 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.cpp +72 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/pad_reflect_1d.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/presets.hpp +2 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.cpp +76 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/repeat_back.hpp +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.cpp +122 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/roll.hpp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/rope.cpp +50 -41
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.cpp +73 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/set_rows.cpp +45 -36
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +330 -165
- package/cpp/llama.cpp/ggml/src/ggml-sycl/softmax.hpp +4 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.cpp +127 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/ssm_conv.hpp +5 -0
- package/cpp/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +12 -6
- package/cpp/llama.cpp/ggml/src/ggml-sycl/wkv.cpp +16 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +38 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +4184 -2159
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +21 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/acc.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add1.comp +28 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/add_id.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/arange.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argmax.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort.comp +33 -26
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/argsort_large.comp +114 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/concat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/contig_copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_dw.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +53 -30
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/conv_transpose_1d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_from_quant.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_to_quant.comp +13 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/copy_transpose.comp +67 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/count_equal.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_f32.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs.comp → dequant_funcs.glsl} +138 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_funcs_cm2.comp → dequant_funcs_cm2.glsl} +18 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{dequant_head.comp → dequant_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_s.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq2_xxs.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_s.comp +7 -6
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq3_xxs.comp +5 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_nl.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_iq4_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_mxfp4.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q2_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q4_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_1.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q5_k.comp +3 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/dequant_q8_0.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/diag_mask_inf.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/div.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +3 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/fill.comp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +52 -14
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{flash_attn_base.comp → flash_attn_base.glsl} +50 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +61 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +54 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp +5 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/geglu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_binary_head.comp → generic_binary_head.glsl} +10 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows.comp +21 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_quant.comp +28 -18
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_head.comp → glu_head.glsl} +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/group_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col.comp +15 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/im2col_3d.comp +125 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/log.comp +18 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.glsl +229 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iface.glsl +33 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_m.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq1_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq2_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_s.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_iq3_xxs.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_nc.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_p021.comp +9 -7
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +3 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +106 -634
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +118 -9
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +556 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_id_funcs.glsl +70 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +77 -214
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.glsl +589 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_shmem_types.glsl +78 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +97 -13
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_adamw.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/opt_step_sgd.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pad.comp +25 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/pool2d.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/quantize_q8_1.comp +55 -5
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/reglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +45 -3
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_funcs.glsl +227 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.glsl +20 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_multi.comp +5 -52
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_neox.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_norm.comp +5 -35
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_params.glsl +27 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_vision.comp +5 -41
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +29 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/scale.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/silu_back.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/soft_max_back.comp +6 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +23 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_conv.comp +44 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/ssm_scan.comp +140 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sub.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp +1 -1
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/swiglu_oai.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +2 -2
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/timestep_embedding.comp +5 -4
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/topk_moe.comp +171 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +22 -0
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{types.comp → types.glsl} +79 -29
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/upscale.comp +36 -12
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +471 -196
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/CMakeLists.txt +8 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/ggml-webgpu.cpp +1690 -383
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/bin_op.tmpl.wgsl +188 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/binary_head.tmpl +45 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/common_decls.tmpl +930 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.tmpl.wgsl +101 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +57 -10
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/get_rows.tmpl.wgsl +874 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/glu.tmpl.wgsl +323 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl +25 -912
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +97 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_reg_tile.tmpl.wgsl +247 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_subgroup_matrix.tmpl.wgsl +302 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.tmpl.wgsl +267 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rms_norm.wgsl +123 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/rope.tmpl.wgsl +295 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/scale.tmpl.wgsl +90 -0
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/{set_rows.wgsl → set_rows.tmpl.wgsl} +38 -8
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/soft_max.tmpl.wgsl +345 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/common.hpp +59 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn.cpp +96 -314
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.cpp +80 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/mmf.hpp +12 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.cpp +79 -0
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/utils.hpp +19 -0
- package/cpp/llama.cpp/ggml/src/ggml.c +440 -17
- package/cpp/llama.cpp/ggml/src/gguf.cpp +104 -29
- package/cpp/llama.cpp/gguf-py/gguf/constants.py +363 -13
- package/cpp/llama.cpp/gguf-py/gguf/gguf_writer.py +64 -0
- package/cpp/llama.cpp/gguf-py/gguf/lazy.py +8 -3
- package/cpp/llama.cpp/gguf-py/gguf/scripts/gguf_convert_endian.py +6 -0
- package/cpp/llama.cpp/gguf-py/gguf/tensor_mapping.py +156 -18
- package/cpp/llama.cpp/gguf-py/gguf/utility.py +80 -0
- package/cpp/llama.cpp/gguf-py/gguf/vocab.py +4 -4
- package/cpp/llama.cpp/include/llama.h +44 -21
- package/cpp/llama.cpp/media/llama1-icon-transparent.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon-transparent.svg +77 -0
- package/cpp/llama.cpp/media/llama1-icon.png +0 -0
- package/cpp/llama.cpp/media/llama1-icon.svg +87 -0
- package/cpp/llama.cpp/requirements/requirements-all.txt +2 -0
- package/cpp/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +3 -3
- package/cpp/llama.cpp/requirements/requirements-convert_legacy_llama.txt +3 -1
- package/cpp/llama.cpp/requirements/requirements-tool_bench.txt +1 -1
- package/cpp/llama.cpp/src/CMakeLists.txt +101 -0
- package/cpp/llama.cpp/src/llama-adapter.cpp +33 -0
- package/cpp/llama.cpp/src/llama-adapter.h +3 -0
- package/cpp/llama.cpp/src/llama-arch.cpp +344 -14
- package/cpp/llama.cpp/src/llama-arch.h +50 -0
- package/cpp/llama.cpp/src/llama-batch.cpp +63 -31
- package/cpp/llama.cpp/src/llama-batch.h +13 -2
- package/cpp/llama.cpp/src/llama-chat.cpp +85 -3
- package/cpp/llama.cpp/src/llama-chat.h +4 -0
- package/cpp/llama.cpp/src/llama-context.cpp +300 -45
- package/cpp/llama.cpp/src/llama-context.h +16 -6
- package/cpp/llama.cpp/src/llama-cparams.h +2 -1
- package/cpp/llama.cpp/src/llama-grammar.cpp +17 -9
- package/cpp/llama.cpp/src/llama-graph.cpp +226 -64
- package/cpp/llama.cpp/src/llama-graph.h +27 -5
- package/cpp/llama.cpp/src/llama-hparams.cpp +53 -2
- package/cpp/llama.cpp/src/llama-hparams.h +48 -8
- package/cpp/llama.cpp/src/llama-impl.cpp +3 -3
- package/cpp/llama.cpp/src/llama-impl.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.cpp +13 -3
- package/cpp/llama.cpp/src/llama-kv-cache-iswa.h +2 -0
- package/cpp/llama.cpp/src/llama-kv-cache.cpp +120 -62
- package/cpp/llama.cpp/src/llama-kv-cache.h +13 -4
- package/cpp/llama.cpp/src/llama-kv-cells.h +44 -2
- package/cpp/llama.cpp/src/llama-memory-hybrid.cpp +19 -9
- package/cpp/llama.cpp/src/llama-memory-hybrid.h +2 -0
- package/cpp/llama.cpp/src/llama-memory-recurrent.cpp +38 -17
- package/cpp/llama.cpp/src/llama-memory-recurrent.h +5 -2
- package/cpp/llama.cpp/src/llama-memory.h +3 -0
- package/cpp/llama.cpp/src/llama-model-loader.cpp +2 -0
- package/cpp/llama.cpp/src/llama-model.cpp +1070 -12614
- package/cpp/llama.cpp/src/llama-model.h +40 -4
- package/cpp/llama.cpp/src/llama-quant.cpp +14 -6
- package/cpp/llama.cpp/src/llama-sampling.cpp +243 -136
- package/cpp/llama.cpp/src/llama-vocab.cpp +43 -3
- package/cpp/llama.cpp/src/llama-vocab.h +43 -39
- package/cpp/llama.cpp/src/llama.cpp +69 -10
- package/cpp/llama.cpp/src/models/afmoe.cpp +187 -0
- package/cpp/llama.cpp/src/models/apertus.cpp +125 -0
- package/cpp/llama.cpp/src/models/arcee.cpp +135 -0
- package/cpp/llama.cpp/src/models/arctic.cpp +138 -0
- package/cpp/llama.cpp/src/models/arwkv7.cpp +86 -0
- package/cpp/llama.cpp/src/models/baichuan.cpp +122 -0
- package/cpp/llama.cpp/src/models/bailingmoe.cpp +144 -0
- package/cpp/llama.cpp/src/models/bailingmoe2.cpp +135 -0
- package/cpp/llama.cpp/src/models/bert.cpp +176 -0
- package/cpp/llama.cpp/src/models/bitnet.cpp +160 -0
- package/cpp/llama.cpp/src/models/bloom.cpp +101 -0
- package/cpp/llama.cpp/src/models/chameleon.cpp +178 -0
- package/cpp/llama.cpp/src/models/chatglm.cpp +132 -0
- package/cpp/llama.cpp/src/models/codeshell.cpp +111 -0
- package/cpp/llama.cpp/src/models/cogvlm.cpp +100 -0
- package/cpp/llama.cpp/src/models/cohere2-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/command-r.cpp +122 -0
- package/cpp/llama.cpp/src/models/dbrx.cpp +123 -0
- package/cpp/llama.cpp/src/models/deci.cpp +135 -0
- package/cpp/llama.cpp/src/models/deepseek.cpp +144 -0
- package/cpp/llama.cpp/src/models/deepseek2.cpp +237 -0
- package/cpp/llama.cpp/src/models/dots1.cpp +134 -0
- package/cpp/llama.cpp/src/models/dream.cpp +105 -0
- package/cpp/llama.cpp/src/models/ernie4-5-moe.cpp +150 -0
- package/cpp/llama.cpp/src/models/ernie4-5.cpp +110 -0
- package/cpp/llama.cpp/src/models/exaone.cpp +114 -0
- package/cpp/llama.cpp/src/models/exaone4.cpp +123 -0
- package/cpp/llama.cpp/src/models/falcon-h1.cpp +113 -0
- package/cpp/llama.cpp/src/models/falcon.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma-embedding.cpp +120 -0
- package/cpp/llama.cpp/src/models/gemma.cpp +112 -0
- package/cpp/llama.cpp/src/models/gemma2-iswa.cpp +125 -0
- package/cpp/llama.cpp/src/models/gemma3-iswa.cpp +131 -0
- package/cpp/llama.cpp/src/models/gemma3n-iswa.cpp +377 -0
- package/cpp/llama.cpp/src/models/glm4-moe.cpp +153 -0
- package/cpp/llama.cpp/src/models/glm4.cpp +127 -0
- package/cpp/llama.cpp/src/models/gpt2.cpp +105 -0
- package/cpp/llama.cpp/src/models/gptneox.cpp +144 -0
- package/cpp/llama.cpp/src/models/granite-hybrid.cpp +196 -0
- package/cpp/llama.cpp/src/models/granite.cpp +211 -0
- package/cpp/llama.cpp/src/models/graph-context-mamba.cpp +283 -0
- package/cpp/llama.cpp/src/models/grok.cpp +159 -0
- package/cpp/llama.cpp/src/models/grovemoe.cpp +141 -0
- package/cpp/llama.cpp/src/models/hunyuan-dense.cpp +132 -0
- package/cpp/llama.cpp/src/models/hunyuan-moe.cpp +154 -0
- package/cpp/llama.cpp/src/models/internlm2.cpp +120 -0
- package/cpp/llama.cpp/src/models/jais.cpp +86 -0
- package/cpp/llama.cpp/src/models/jamba.cpp +106 -0
- package/cpp/llama.cpp/src/models/lfm2.cpp +173 -0
- package/cpp/llama.cpp/src/models/llada-moe.cpp +122 -0
- package/cpp/llama.cpp/src/models/llada.cpp +99 -0
- package/cpp/llama.cpp/src/models/llama-iswa.cpp +174 -0
- package/cpp/llama.cpp/src/models/llama.cpp +155 -0
- package/cpp/llama.cpp/src/models/mamba.cpp +55 -0
- package/cpp/llama.cpp/src/models/minicpm3.cpp +199 -0
- package/cpp/llama.cpp/src/models/minimax-m2.cpp +124 -0
- package/cpp/llama.cpp/src/models/models.h +485 -0
- package/cpp/llama.cpp/src/models/mpt.cpp +126 -0
- package/cpp/llama.cpp/src/models/nemotron-h.cpp +121 -0
- package/cpp/llama.cpp/src/models/nemotron.cpp +122 -0
- package/cpp/llama.cpp/src/models/neo-bert.cpp +104 -0
- package/cpp/llama.cpp/src/models/olmo.cpp +121 -0
- package/cpp/llama.cpp/src/models/olmo2.cpp +150 -0
- package/cpp/llama.cpp/src/models/olmoe.cpp +124 -0
- package/cpp/llama.cpp/src/models/openai-moe-iswa.cpp +124 -0
- package/cpp/llama.cpp/src/models/openelm.cpp +124 -0
- package/cpp/llama.cpp/src/models/orion.cpp +123 -0
- package/cpp/llama.cpp/src/models/pangu-embedded.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi2.cpp +121 -0
- package/cpp/llama.cpp/src/models/phi3.cpp +152 -0
- package/cpp/llama.cpp/src/models/plamo.cpp +110 -0
- package/cpp/llama.cpp/src/models/plamo2.cpp +316 -0
- package/cpp/llama.cpp/src/models/plm.cpp +168 -0
- package/cpp/llama.cpp/src/models/qwen.cpp +108 -0
- package/cpp/llama.cpp/src/models/qwen2.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen2moe.cpp +151 -0
- package/cpp/llama.cpp/src/models/qwen2vl.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3.cpp +117 -0
- package/cpp/llama.cpp/src/models/qwen3moe.cpp +124 -0
- package/cpp/llama.cpp/src/models/qwen3vl-moe.cpp +149 -0
- package/cpp/llama.cpp/src/models/qwen3vl.cpp +141 -0
- package/cpp/llama.cpp/src/models/refact.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6-base.cpp +162 -0
- package/cpp/llama.cpp/src/models/rwkv6.cpp +94 -0
- package/cpp/llama.cpp/src/models/rwkv6qwen2.cpp +86 -0
- package/cpp/llama.cpp/src/models/rwkv7-base.cpp +135 -0
- package/cpp/llama.cpp/src/models/rwkv7.cpp +90 -0
- package/cpp/llama.cpp/src/models/seed-oss.cpp +124 -0
- package/cpp/llama.cpp/src/models/smallthinker.cpp +120 -0
- package/cpp/llama.cpp/src/models/smollm3.cpp +128 -0
- package/cpp/llama.cpp/src/models/stablelm.cpp +146 -0
- package/cpp/llama.cpp/src/models/starcoder.cpp +100 -0
- package/cpp/llama.cpp/src/models/starcoder2.cpp +121 -0
- package/cpp/llama.cpp/src/models/t5-dec.cpp +166 -0
- package/cpp/llama.cpp/src/models/t5-enc.cpp +96 -0
- package/cpp/llama.cpp/src/models/wavtokenizer-dec.cpp +149 -0
- package/cpp/llama.cpp/src/models/xverse.cpp +108 -0
- package/cpp/llama.cpp/src/unicode.cpp +77 -0
- package/cpp/llama.cpp/src/unicode.h +43 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/CMakeLists.txt +94 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.cpp +9339 -0
- package/cpp/llama.cpp/vendor/cpp-httplib/httplib.h +433 -8222
- package/cpp/llama.cpp/vendor/cpp-httplib/patch-boringssl.cmake +6 -0
- package/cpp/llama.cpp/vendor/miniaudio/miniaudio.h +4179 -1900
- package/cpp/llama.cpp/vendor/minja/chat-template.hpp +9 -2
- package/cpp/llama.cpp/vendor/minja/minja.hpp +101 -22
- package/ios/include/chat.h +16 -3
- package/ios/include/common/minja/chat-template.hpp +9 -2
- package/ios/include/common/minja/minja.hpp +101 -22
- package/ios/include/common.h +57 -19
- package/ios/include/json-schema-to-grammar.h +2 -0
- package/ios/include/llama.h +44 -21
- package/ios/include/log.h +12 -4
- package/ios/include/sampling.h +3 -1
- package/ios/libs/llama.xcframework/Info.plist +20 -20
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6399 -5557
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +6362 -5520
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +4813 -4241
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +19 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +1 -1
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +1 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +199 -6
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/Headers/llama.h +44 -21
- package/ios/libs/llama.xcframework/ios-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/package.json +10 -4
- package/cpp/llama.cpp/ggml/src/ggml-cann/Doxyfile +0 -2579
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cu +0 -371
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f16.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cu +0 -379
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-tile-f32.cuh +0 -3
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f16.cuh +0 -495
- package/cpp/llama.cpp/ggml/src/ggml-cuda/fattn-vec-f32.cuh +0 -486
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +0 -5
- package/cpp/llama.cpp/ggml/src/ggml-metal/ggml-metal.m +0 -6886
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_base.comp +0 -154
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +0 -105
- package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/rope_head.comp +0 -55
- package/cpp/llama.cpp/ggml/src/ggml-webgpu/wgsl-shaders/cpy.wgsl +0 -60
- package/cpp/llama.cpp/ggml/src/ggml-zdnn/ggml-zdnn-impl.h +0 -97
- package/cpp/llama.cpp/models/ggml-vocab-aquila.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-baichuan.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-bert-bge.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-command-r.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-coder.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-deepseek-llm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-falcon.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-gpt-2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-gpt-neox.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-bpe.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-llama-spm.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-mpt.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-nomic-bert-moe.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-phi-3.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-qwen2.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-refact.gguf.out +0 -46
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf +0 -0
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.inp +0 -112
- package/cpp/llama.cpp/models/ggml-vocab-starcoder.gguf.out +0 -46
- package/cpp/llama.cpp/models/templates/ByteDance-Seed-OSS.jinja +0 -171
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r-plus-tool_use.jinja +0 -202
- package/cpp/llama.cpp/models/templates/CohereForAI-c4ai-command-r7b-12-2024-tool_use.jinja +0 -156
- package/cpp/llama.cpp/models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja +0 -124
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja +0 -152
- package/cpp/llama.cpp/models/templates/Qwen-QwQ-32B.jinja +0 -62
- package/cpp/llama.cpp/models/templates/Qwen-Qwen2.5-7B-Instruct.jinja +0 -54
- package/cpp/llama.cpp/models/templates/Qwen-Qwen3-0.6B.jinja +0 -85
- package/cpp/llama.cpp/models/templates/README.md +0 -25
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Llama-8B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/deepseek-ai-DeepSeek-R1-Distill-Qwen-32B.jinja +0 -1
- package/cpp/llama.cpp/models/templates/fireworks-ai-llama-3-firefunction-v2.jinja +0 -57
- package/cpp/llama.cpp/models/templates/google-gemma-2-2b-it.jinja +0 -4
- package/cpp/llama.cpp/models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja +0 -59
- package/cpp/llama.cpp/models/templates/llama-cpp-deepseek-r1.jinja +0 -76
- package/cpp/llama.cpp/models/templates/llama-cpp-rwkv-world.jinja +0 -34
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.1.jinja +0 -58
- package/cpp/llama.cpp/models/templates/meetkai-functionary-medium-v3.2.jinja +0 -287
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja +0 -93
- package/cpp/llama.cpp/models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja +0 -109
- package/cpp/llama.cpp/models/templates/microsoft-Phi-3.5-mini-instruct.jinja +0 -8
- package/cpp/llama.cpp/models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja +0 -87
- package/cpp/llama.cpp/models/templates/moonshotai-Kimi-K2.jinja +0 -43
- package/cpp/llama.cpp/models/templates/openai-gpt-oss-120b.jinja +0 -331
- package/cpp/llama.cpp/models/templates/unsloth-mistral-Devstral-Small-2507.jinja +0 -105
- package/cpp/llama.cpp/prompts/LLM-questions.txt +0 -49
- package/cpp/llama.cpp/prompts/alpaca.txt +0 -1
- package/cpp/llama.cpp/prompts/assistant.txt +0 -31
- package/cpp/llama.cpp/prompts/chat-with-baichuan.txt +0 -4
- package/cpp/llama.cpp/prompts/chat-with-bob.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-qwen.txt +0 -1
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -7
- package/cpp/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -7
- package/cpp/llama.cpp/prompts/chat.txt +0 -28
- package/cpp/llama.cpp/prompts/dan-modified.txt +0 -1
- package/cpp/llama.cpp/prompts/dan.txt +0 -1
- package/cpp/llama.cpp/prompts/mnemonics.txt +0 -93
- package/cpp/llama.cpp/prompts/parallel-questions.txt +0 -43
- package/cpp/llama.cpp/prompts/reason-act.txt +0 -18
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4247
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/A/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/Resources/Info.plist +0 -32
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/Versions/Current/llama +0 -0
- package/ios/libs/llama.xcframework/macos-arm64_x86_64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5561
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5524
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4246
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Info.plist +0 -35
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/tvos-arm64_x86_64-simulator/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5558
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64/llama.framework/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Info.plist +0 -20
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/DWARF/llama +0 -0
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/aarch64/llama.yml +0 -5520
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/dSYMs/llama.dSYM/Contents/Resources/Relocations/x86_64/llama.yml +0 -4243
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-alloc.h +0 -76
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-backend.h +0 -354
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-blas.h +0 -25
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-cpu.h +0 -145
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-metal.h +0 -66
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml-opt.h +0 -256
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/ggml.h +0 -2492
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/gguf.h +0 -202
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Headers/llama.h +0 -1391
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Info.plist +0 -32
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/Modules/module.modulemap +0 -17
- package/ios/libs/llama.xcframework/xros-arm64_x86_64-simulator/llama.framework/llama +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_bfloat16_support.comp → feature-tests/bfloat16.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat_support.comp → feature-tests/coopmat.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_coopmat2_support.comp → feature-tests/coopmat2.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{test_integer_dot_support.comp → feature-tests/integer_dot.comp} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_head.comp → generic_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{generic_unary_head.comp → generic_unary_head.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{glu_main.comp → glu_main.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{rte.comp → rte.glsl} +0 -0
- /package/cpp/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/{utils.comp → utils.glsl} +0 -0
|
@@ -22,24 +22,24 @@
|
|
|
22
22
|
|
|
23
23
|
#include "ggml-cann.h"
|
|
24
24
|
|
|
25
|
+
#include "ggml-backend-impl.h"
|
|
26
|
+
#include "ggml-cann/aclnn_ops.h"
|
|
27
|
+
#include "ggml-cann/common.h"
|
|
28
|
+
#include "ggml-impl.h"
|
|
29
|
+
#include "ggml.h"
|
|
30
|
+
|
|
25
31
|
#include <acl/acl.h>
|
|
26
|
-
#include <stdarg.h>
|
|
27
32
|
#include <aclnnop/aclnn_trans_matmul_weight.h>
|
|
33
|
+
#include <stdarg.h>
|
|
28
34
|
|
|
35
|
+
#include <chrono>
|
|
29
36
|
#include <cmath>
|
|
30
37
|
#include <cstdio>
|
|
31
38
|
#include <cstring>
|
|
32
39
|
#include <mutex>
|
|
40
|
+
#include <optional>
|
|
33
41
|
#include <queue>
|
|
34
|
-
#include <chrono>
|
|
35
42
|
#include <unordered_set>
|
|
36
|
-
#include <optional>
|
|
37
|
-
|
|
38
|
-
#include "ggml-impl.h"
|
|
39
|
-
#include "ggml-backend-impl.h"
|
|
40
|
-
#include "ggml-cann/aclnn_ops.h"
|
|
41
|
-
#include "ggml-cann/common.h"
|
|
42
|
-
#include "ggml.h"
|
|
43
43
|
|
|
44
44
|
#define GGML_COMMON_DECL_C
|
|
45
45
|
|
|
@@ -56,33 +56,41 @@
|
|
|
56
56
|
* @param line The line number where the error occurred.
|
|
57
57
|
* @param msg The error message.
|
|
58
58
|
*/
|
|
59
|
-
[[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
|
|
60
|
-
const char* file, int line, const char* msg) {
|
|
59
|
+
[[noreturn]] void ggml_cann_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
|
|
61
60
|
int32_t id = -1;
|
|
62
61
|
aclrtGetDevice(&id);
|
|
63
62
|
|
|
64
63
|
GGML_LOG_ERROR("CANN error: %s\n", msg);
|
|
65
|
-
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
|
|
66
|
-
file, line);
|
|
64
|
+
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
|
|
67
65
|
GGML_LOG_ERROR(" %s\n", stmt);
|
|
68
66
|
// abort with GGML_ASSERT to get a stack trace
|
|
69
67
|
GGML_ABORT("CANN error");
|
|
70
68
|
}
|
|
71
69
|
|
|
70
|
+
// Thread-local variable to record the current device of this thread.
|
|
71
|
+
thread_local int g_current_cann_device = -1;
|
|
72
|
+
|
|
72
73
|
/**
|
|
73
|
-
* @brief
|
|
74
|
+
* @brief Set the CANN device to be used.
|
|
74
75
|
*
|
|
75
|
-
* @param device The device ID to set.
|
|
76
|
+
* @param device The target device ID to set.
|
|
76
77
|
*/
|
|
77
78
|
void ggml_cann_set_device(const int32_t device) {
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
//
|
|
79
|
+
// int current_device = -1;
|
|
80
|
+
// Note: In some CANN versions, if no device has been set yet,
|
|
81
|
+
// aclrtGetDevice(&current_device) may return 0 by default.
|
|
82
|
+
// aclrtGetDevice(&current_device);
|
|
81
83
|
|
|
82
|
-
//
|
|
83
|
-
|
|
84
|
-
|
|
84
|
+
// If the current device is already the target one, no need to switch.
|
|
85
|
+
if (device == g_current_cann_device) {
|
|
86
|
+
return;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Switch to the new device.
|
|
85
90
|
ACL_CHECK(aclrtSetDevice(device));
|
|
91
|
+
|
|
92
|
+
// Update the global device record.
|
|
93
|
+
g_current_cann_device = device;
|
|
86
94
|
}
|
|
87
95
|
|
|
88
96
|
/**
|
|
@@ -100,9 +108,11 @@ int32_t ggml_cann_get_device() {
|
|
|
100
108
|
* @brief Get the value of the specified environment variable (name).
|
|
101
109
|
* if not empty, return a std::string object
|
|
102
110
|
*/
|
|
103
|
-
std::optional<std::string> get_env(const std::string& name) {
|
|
104
|
-
const char* val = std::getenv(name.c_str());
|
|
105
|
-
if (!val)
|
|
111
|
+
std::optional<std::string> get_env(const std::string & name) {
|
|
112
|
+
const char * val = std::getenv(name.c_str());
|
|
113
|
+
if (!val) {
|
|
114
|
+
return std::nullopt;
|
|
115
|
+
}
|
|
106
116
|
std::string res = std::string(val);
|
|
107
117
|
std::transform(res.begin(), res.end(), res.begin(), ::tolower);
|
|
108
118
|
return res;
|
|
@@ -111,11 +121,29 @@ std::optional<std::string> get_env(const std::string& name) {
|
|
|
111
121
|
/**
|
|
112
122
|
* @brief Verify whether the environment variable is a valid value.
|
|
113
123
|
*/
|
|
114
|
-
bool parse_bool(const std::string& value) {
|
|
115
|
-
std::unordered_set<std::string> valid_values = {"on", "1", "yes", "y", "enable", "true"};
|
|
124
|
+
bool parse_bool(const std::string & value) {
|
|
125
|
+
std::unordered_set<std::string> valid_values = { "on", "1", "yes", "y", "enable", "true" };
|
|
116
126
|
return valid_values.find(value) != valid_values.end();
|
|
117
127
|
}
|
|
118
128
|
|
|
129
|
+
/**
|
|
130
|
+
* @brief Parse a string as an integer, returning 0 if invalid.
|
|
131
|
+
*
|
|
132
|
+
* This function attempts to convert the input string `value` to an `int`.
|
|
133
|
+
* If the string is not a valid integer or is out of the `int` range,
|
|
134
|
+
* it returns 0.
|
|
135
|
+
*
|
|
136
|
+
* @param value The string to parse.
|
|
137
|
+
* @return The parsed integer, or 0 if conversion fails.
|
|
138
|
+
*/
|
|
139
|
+
int parse_integer(const std::string & value) {
|
|
140
|
+
try {
|
|
141
|
+
return std::stoi(value);
|
|
142
|
+
} catch (...) {
|
|
143
|
+
return 0;
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
119
147
|
/**
|
|
120
148
|
* @brief Initialize the CANN device information.
|
|
121
149
|
*
|
|
@@ -127,11 +155,10 @@ bool parse_bool(const std::string& value) {
|
|
|
127
155
|
static ggml_cann_device_info ggml_cann_init() {
|
|
128
156
|
ggml_cann_device_info info = {};
|
|
129
157
|
|
|
130
|
-
aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
|
|
158
|
+
aclError err = aclrtGetDeviceCount((uint32_t *) &info.device_count);
|
|
131
159
|
|
|
132
160
|
if (err != ACL_SUCCESS) {
|
|
133
|
-
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
|
|
134
|
-
__func__, aclGetRecentErrMsg());
|
|
161
|
+
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", __func__, aclGetRecentErrMsg());
|
|
135
162
|
return info;
|
|
136
163
|
}
|
|
137
164
|
|
|
@@ -139,16 +166,15 @@ static ggml_cann_device_info ggml_cann_init() {
|
|
|
139
166
|
|
|
140
167
|
for (int id = 0; id < info.device_count; ++id) {
|
|
141
168
|
aclrtPhysicalMemProp prop = {};
|
|
142
|
-
prop.handleType
|
|
143
|
-
prop.allocationType
|
|
144
|
-
prop.memAttr
|
|
145
|
-
prop.location.type
|
|
146
|
-
prop.location.id
|
|
147
|
-
prop.reserve
|
|
148
|
-
err
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
info.devices[id].vmm = err == ACL_SUCCESS;
|
|
169
|
+
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
170
|
+
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
171
|
+
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
172
|
+
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
173
|
+
prop.location.id = id;
|
|
174
|
+
prop.reserve = 0;
|
|
175
|
+
err = aclrtMemGetAllocationGranularity(&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
|
|
176
|
+
&info.devices[id].vmm_granularity);
|
|
177
|
+
info.devices[id].vmm = err == ACL_SUCCESS;
|
|
152
178
|
|
|
153
179
|
size_t free, total;
|
|
154
180
|
ggml_backend_cann_get_device_memory(id, &free, &total);
|
|
@@ -168,7 +194,7 @@ static ggml_cann_device_info ggml_cann_init() {
|
|
|
168
194
|
*
|
|
169
195
|
* @return A reference to the structure containing the device information.
|
|
170
196
|
*/
|
|
171
|
-
const ggml_cann_device_info& ggml_cann_info() {
|
|
197
|
+
const ggml_cann_device_info & ggml_cann_info() {
|
|
172
198
|
static ggml_cann_device_info info = ggml_cann_init();
|
|
173
199
|
return info;
|
|
174
200
|
}
|
|
@@ -188,7 +214,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
188
214
|
/**
|
|
189
215
|
* @brief The minimum free margin for a buffer.
|
|
190
216
|
*/
|
|
191
|
-
static const size_t min_free_margin = 1ull << 20;
|
|
217
|
+
static const size_t min_free_margin = 1ull << 20; // 1MB
|
|
192
218
|
|
|
193
219
|
/**
|
|
194
220
|
* @brief The alignment for buffer allocation.
|
|
@@ -209,22 +235,18 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
209
235
|
* @brief Structure representing a CANN buffer.
|
|
210
236
|
*/
|
|
211
237
|
struct ggml_cann_buffer {
|
|
212
|
-
void*
|
|
213
|
-
size_t
|
|
214
|
-
std::chrono::steady_clock::time_point last_used;
|
|
238
|
+
void * ptr = nullptr; ///< Pointer to the buffer.
|
|
239
|
+
size_t size = 0; ///< Size of the buffer.
|
|
240
|
+
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
|
215
241
|
|
|
216
|
-
bool operator>(const ggml_cann_buffer& other) const {
|
|
217
|
-
return size > other.size;
|
|
218
|
-
}
|
|
242
|
+
bool operator>(const ggml_cann_buffer & other) const { return size > other.size; }
|
|
219
243
|
};
|
|
220
244
|
|
|
221
245
|
/**
|
|
222
246
|
* @brief Array of CANN buffers in the pool.
|
|
223
247
|
*/
|
|
224
|
-
std::unordered_map<void*, size_t>
|
|
225
|
-
std::priority_queue<ggml_cann_buffer,
|
|
226
|
-
std::vector<ggml_cann_buffer>,
|
|
227
|
-
std::greater<>> free_buffers ;
|
|
248
|
+
std::unordered_map<void *, size_t> buffer_pool;
|
|
249
|
+
std::priority_queue<ggml_cann_buffer, std::vector<ggml_cann_buffer>, std::greater<>> free_buffers;
|
|
228
250
|
|
|
229
251
|
/**
|
|
230
252
|
* @brief Total size of all buffers in the pool.
|
|
@@ -245,7 +267,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
245
267
|
*/
|
|
246
268
|
~ggml_cann_pool_buf_prio() {
|
|
247
269
|
ggml_cann_set_device(device);
|
|
248
|
-
for (auto& [b_ptr, b_size] : buffer_pool) {
|
|
270
|
+
for (auto & [b_ptr, b_size] : buffer_pool) {
|
|
249
271
|
aclrtFree(b_ptr);
|
|
250
272
|
pool_size -= b_size;
|
|
251
273
|
}
|
|
@@ -261,14 +283,14 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
261
283
|
* the allocated buffer.
|
|
262
284
|
* @return A pointer to the allocated buffer.
|
|
263
285
|
*/
|
|
264
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
286
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
265
287
|
size = GGML_PAD(size, alignment);
|
|
266
288
|
if (size == 0) {
|
|
267
289
|
size = alignment;
|
|
268
290
|
}
|
|
269
291
|
|
|
270
|
-
void* ptr = nullptr;
|
|
271
|
-
auto
|
|
292
|
+
void * ptr = nullptr;
|
|
293
|
+
auto now = std::chrono::steady_clock::now();
|
|
272
294
|
|
|
273
295
|
std::vector<ggml_cann_buffer> free_buffers_rest;
|
|
274
296
|
free_buffers_rest.reserve(free_buffers.size());
|
|
@@ -281,24 +303,22 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
281
303
|
const size_t margin = b.size - size;
|
|
282
304
|
if (margin <= max_reuse_margin) {
|
|
283
305
|
*actual_size = b.size;
|
|
284
|
-
ptr
|
|
306
|
+
ptr = b.ptr;
|
|
285
307
|
#ifdef DEBUG_CANN_MALLOC
|
|
286
308
|
GGML_LOG_INFO(
|
|
287
309
|
"cann pool[%d]: reused %p, "
|
|
288
310
|
"pool_size = %5u MB, "
|
|
289
311
|
"size = %5u MB, "
|
|
290
312
|
"margin = %5u MB\n",
|
|
291
|
-
device, b.ptr,
|
|
292
|
-
(uint32_t)(GGML_PAD(
|
|
293
|
-
(uint32_t)(GGML_PAD(
|
|
294
|
-
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
|
313
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
314
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576),
|
|
315
|
+
(uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
|
|
295
316
|
#endif
|
|
296
317
|
break;
|
|
297
318
|
}
|
|
298
319
|
}
|
|
299
320
|
|
|
300
|
-
bool should_clean = !disable_clean &&
|
|
301
|
-
b.size > min_free_margin &&
|
|
321
|
+
bool should_clean = !disable_clean && b.size > min_free_margin &&
|
|
302
322
|
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
|
303
323
|
if (should_clean) {
|
|
304
324
|
// free the buffer if the size is needed to be freed
|
|
@@ -310,20 +330,20 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
310
330
|
"cann pool[%d]: clean %p, "
|
|
311
331
|
"pool_size = %5u MB, "
|
|
312
332
|
"size = %5u MB\n",
|
|
313
|
-
device, b.ptr,
|
|
314
|
-
(uint32_t)(GGML_PAD(
|
|
315
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
333
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
334
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
316
335
|
#endif
|
|
317
336
|
continue;
|
|
318
337
|
}
|
|
319
338
|
free_buffers_rest.push_back(b);
|
|
320
339
|
}
|
|
321
|
-
for (ggml_cann_buffer &b : free_buffers_rest) {
|
|
340
|
+
for (ggml_cann_buffer & b : free_buffers_rest) {
|
|
322
341
|
free_buffers.push(std::move(b));
|
|
323
342
|
}
|
|
324
343
|
|
|
325
344
|
#ifdef DEBUG_CANN_MALLOC
|
|
326
|
-
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
|
|
345
|
+
GGML_LOG_INFO("cann pool[%d] free pool_size = %5u MB\n\n", device,
|
|
346
|
+
(uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
327
347
|
#endif
|
|
328
348
|
if (ptr != nullptr) {
|
|
329
349
|
return ptr;
|
|
@@ -339,8 +359,8 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
339
359
|
"cann pool[%d]: allocate %p, "
|
|
340
360
|
"pool_size = %5u MB, "
|
|
341
361
|
"size = %5u MB\n",
|
|
342
|
-
device, ptr, (uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576),
|
|
343
|
-
(uint32_t)(GGML_PAD(size, 1048576) / 1048576));
|
|
362
|
+
device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
363
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576));
|
|
344
364
|
#endif
|
|
345
365
|
buffer_pool.emplace(ptr, size);
|
|
346
366
|
return ptr;
|
|
@@ -352,7 +372,7 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
352
372
|
* @param ptr Pointer to the buffer to free.
|
|
353
373
|
* @param size Size of the buffer to free.
|
|
354
374
|
*/
|
|
355
|
-
void free(void* ptr, size_t size) override {
|
|
375
|
+
void free(void * ptr, size_t size) override {
|
|
356
376
|
GGML_UNUSED(size);
|
|
357
377
|
auto it = buffer_pool.find(ptr);
|
|
358
378
|
if (it == buffer_pool.end()) {
|
|
@@ -360,13 +380,12 @@ struct ggml_cann_pool_buf_prio : public ggml_cann_pool {
|
|
|
360
380
|
}
|
|
361
381
|
|
|
362
382
|
auto now = std::chrono::steady_clock::now();
|
|
363
|
-
free_buffers.emplace(ggml_cann_buffer{ptr, it->second, now});
|
|
383
|
+
free_buffers.emplace(ggml_cann_buffer{ ptr, it->second, now });
|
|
364
384
|
#ifdef DEBUG_CANN_MALLOC
|
|
365
385
|
GGML_LOG_INFO(
|
|
366
386
|
"cann pool[%d]: return %p, "
|
|
367
387
|
"pool_size = %5u MB\n",
|
|
368
|
-
device, ptr,
|
|
369
|
-
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
|
388
|
+
device, ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
370
389
|
#endif
|
|
371
390
|
}
|
|
372
391
|
};
|
|
@@ -385,7 +404,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
385
404
|
/**
|
|
386
405
|
* @brief The minimum free margin for a buffer.
|
|
387
406
|
*/
|
|
388
|
-
static const size_t min_free_margin = 1ull << 20;
|
|
407
|
+
static const size_t min_free_margin = 1ull << 20; // 1MB
|
|
389
408
|
|
|
390
409
|
/**
|
|
391
410
|
* @brief The alignment for buffer allocation.
|
|
@@ -411,10 +430,10 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
411
430
|
* @brief Structure representing a CANN buffer.
|
|
412
431
|
*/
|
|
413
432
|
struct ggml_cann_buffer {
|
|
414
|
-
void*
|
|
415
|
-
size_t
|
|
416
|
-
bool
|
|
417
|
-
std::chrono::steady_clock::time_point last_used;
|
|
433
|
+
void * ptr = nullptr; ///< Pointer to the buffer memory.
|
|
434
|
+
size_t size = 0; ///< Size of the buffer.
|
|
435
|
+
bool used = false; ///< Whether the buffer is currently in use.
|
|
436
|
+
std::chrono::steady_clock::time_point last_used; ///< Last used time.
|
|
418
437
|
};
|
|
419
438
|
|
|
420
439
|
/**
|
|
@@ -442,7 +461,7 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
442
461
|
~ggml_cann_pool_buf() {
|
|
443
462
|
ggml_cann_set_device(device);
|
|
444
463
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
445
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
464
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
446
465
|
if (b.ptr != nullptr) {
|
|
447
466
|
aclrtFree(b.ptr);
|
|
448
467
|
pool_size -= b.size;
|
|
@@ -459,18 +478,18 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
459
478
|
* the allocated buffer.
|
|
460
479
|
* @return A pointer to the allocated buffer.
|
|
461
480
|
*/
|
|
462
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
481
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
463
482
|
size = GGML_PAD(size, alignment);
|
|
464
483
|
if (size == 0) {
|
|
465
484
|
size = alignment;
|
|
466
485
|
}
|
|
467
486
|
|
|
468
|
-
void* ptr = nullptr;
|
|
469
|
-
auto
|
|
487
|
+
void * ptr = nullptr;
|
|
488
|
+
auto now = std::chrono::steady_clock::now();
|
|
470
489
|
|
|
471
490
|
int i = 0;
|
|
472
491
|
for (; i < MAX_BUFFERS; ++i) {
|
|
473
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
492
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
474
493
|
if (b.ptr == nullptr) {
|
|
475
494
|
break;
|
|
476
495
|
}
|
|
@@ -482,25 +501,23 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
482
501
|
const size_t margin = b.size - size;
|
|
483
502
|
if (margin <= max_reuse_margin) {
|
|
484
503
|
*actual_size = b.size;
|
|
485
|
-
b.used
|
|
486
|
-
ptr
|
|
504
|
+
b.used = true;
|
|
505
|
+
ptr = b.ptr;
|
|
487
506
|
#ifdef DEBUG_CANN_MALLOC
|
|
488
507
|
GGML_LOG_INFO(
|
|
489
508
|
"cann pool[%d]: reused %p, "
|
|
490
509
|
"pool_size = %5u MB, "
|
|
491
510
|
"size = %5u MB, "
|
|
492
511
|
"margin = %5u MB\n",
|
|
493
|
-
device, b.ptr,
|
|
494
|
-
(uint32_t)(GGML_PAD(
|
|
495
|
-
(uint32_t)(GGML_PAD(
|
|
496
|
-
(uint32_t)(GGML_PAD(margin, 1048576) / 1048576));
|
|
512
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
513
|
+
(uint32_t) (GGML_PAD(size, 1048576) / 1048576),
|
|
514
|
+
(uint32_t) (GGML_PAD(margin, 1048576) / 1048576));
|
|
497
515
|
#endif
|
|
498
516
|
break;
|
|
499
517
|
}
|
|
500
518
|
}
|
|
501
519
|
|
|
502
|
-
bool should_clean = !disable_clean &&
|
|
503
|
-
b.size > min_free_margin &&
|
|
520
|
+
bool should_clean = !disable_clean && b.size > min_free_margin &&
|
|
504
521
|
std::chrono::duration_cast<std::chrono::milliseconds>(now - b.last_used).count() > 100;
|
|
505
522
|
if (should_clean) {
|
|
506
523
|
// free the buffer if the size is needed to be freed
|
|
@@ -511,9 +528,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
511
528
|
"cann pool[%d]: clean %p, "
|
|
512
529
|
"pool_size = %5u MB, "
|
|
513
530
|
"size = %5u MB\n",
|
|
514
|
-
device, b.ptr,
|
|
515
|
-
(uint32_t)(GGML_PAD(
|
|
516
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
531
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
532
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
517
533
|
#endif
|
|
518
534
|
b.ptr = nullptr;
|
|
519
535
|
}
|
|
@@ -524,13 +540,13 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
524
540
|
|
|
525
541
|
if (i < MAX_BUFFERS) {
|
|
526
542
|
// allocate a new buffer if no buffer can be reused
|
|
527
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
543
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
528
544
|
ggml_cann_set_device(device);
|
|
529
545
|
ACL_CHECK(aclrtMalloc(&b.ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
|
|
530
546
|
pool_size += size;
|
|
531
547
|
*actual_size = size;
|
|
532
|
-
b.size
|
|
533
|
-
b.used
|
|
548
|
+
b.size = size;
|
|
549
|
+
b.used = true;
|
|
534
550
|
if (i >= MAX_BUFFERS - 8) {
|
|
535
551
|
GGML_LOG_WARN("cann pool[%d]: slots almost full\n", device);
|
|
536
552
|
}
|
|
@@ -539,9 +555,8 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
539
555
|
"cann pool[%d]: allocate %p, "
|
|
540
556
|
"pool_size = %5u MB, "
|
|
541
557
|
"size = %5u MB\n",
|
|
542
|
-
device, b.ptr,
|
|
543
|
-
(uint32_t)(GGML_PAD(
|
|
544
|
-
(uint32_t)(GGML_PAD(b.size, 1048576) / 1048576));
|
|
558
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576),
|
|
559
|
+
(uint32_t) (GGML_PAD(b.size, 1048576) / 1048576));
|
|
545
560
|
#endif
|
|
546
561
|
return b.ptr;
|
|
547
562
|
}
|
|
@@ -555,21 +570,20 @@ struct ggml_cann_pool_buf : public ggml_cann_pool {
|
|
|
555
570
|
* @param ptr Pointer to the buffer to free.
|
|
556
571
|
* @param size Size of the buffer to free.
|
|
557
572
|
*/
|
|
558
|
-
void free(void* ptr, size_t size) override {
|
|
573
|
+
void free(void * ptr, size_t size) override {
|
|
559
574
|
GGML_UNUSED(size);
|
|
560
575
|
for (int i = 0; i < MAX_BUFFERS; ++i) {
|
|
561
|
-
ggml_cann_buffer& b = buffer_pool[i];
|
|
576
|
+
ggml_cann_buffer & b = buffer_pool[i];
|
|
562
577
|
if (b.ptr != ptr) {
|
|
563
578
|
continue;
|
|
564
579
|
}
|
|
565
|
-
b.used
|
|
580
|
+
b.used = false;
|
|
566
581
|
b.last_used = std::chrono::steady_clock::now();
|
|
567
582
|
#ifdef DEBUG_CANN_MALLOC
|
|
568
583
|
GGML_LOG_INFO(
|
|
569
584
|
"cann pool[%d]: return %p, "
|
|
570
585
|
"pool_size = %5u MB\n",
|
|
571
|
-
device, b.ptr,
|
|
572
|
-
(uint32_t)(GGML_PAD(pool_size, 1048576) / 1048576));
|
|
586
|
+
device, b.ptr, (uint32_t) (GGML_PAD(pool_size, 1048576) / 1048576));
|
|
573
587
|
#endif
|
|
574
588
|
return;
|
|
575
589
|
}
|
|
@@ -597,7 +611,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
597
611
|
/**
|
|
598
612
|
* @brief Pointer to the start of the virtual memory pool.
|
|
599
613
|
*/
|
|
600
|
-
void* pool_addr = 0;
|
|
614
|
+
void * pool_addr = 0;
|
|
601
615
|
|
|
602
616
|
/**
|
|
603
617
|
* @brief Amount of virtual memory used in the pool.
|
|
@@ -622,7 +636,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
622
636
|
/**
|
|
623
637
|
* @brief Offsets for the mapped memory regions.
|
|
624
638
|
*/
|
|
625
|
-
std::vector<void*> map_offsets;
|
|
639
|
+
std::vector<void *> map_offsets;
|
|
626
640
|
|
|
627
641
|
/**
|
|
628
642
|
* @brief Constructor to initialize the buffer pool with virtual memory for
|
|
@@ -630,11 +644,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
630
644
|
*
|
|
631
645
|
* @param device The device ID to associate with this buffer pool.
|
|
632
646
|
*/
|
|
633
|
-
explicit ggml_cann_pool_vmm(int device)
|
|
634
|
-
|
|
635
|
-
auto dev = ggml_cann_info().devices[device];
|
|
647
|
+
explicit ggml_cann_pool_vmm(int device) : device(device) {
|
|
648
|
+
auto dev = ggml_cann_info().devices[device];
|
|
636
649
|
granularity = dev.vmm_granularity;
|
|
637
|
-
max_size
|
|
650
|
+
max_size = dev.total_vram;
|
|
638
651
|
}
|
|
639
652
|
|
|
640
653
|
/**
|
|
@@ -642,10 +655,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
642
655
|
*/
|
|
643
656
|
~ggml_cann_pool_vmm() {
|
|
644
657
|
if (pool_addr != 0) {
|
|
645
|
-
for (auto& offset : map_offsets) {
|
|
658
|
+
for (auto & offset : map_offsets) {
|
|
646
659
|
ACL_CHECK(aclrtUnmapMem(offset));
|
|
647
660
|
}
|
|
648
|
-
for (auto& handle : handles) {
|
|
661
|
+
for (auto & handle : handles) {
|
|
649
662
|
ACL_CHECK(aclrtFreePhysical(handle));
|
|
650
663
|
}
|
|
651
664
|
ACL_CHECK(aclrtReleaseMemAddress(pool_addr));
|
|
@@ -660,11 +673,11 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
660
673
|
* the allocated buffer.
|
|
661
674
|
* @return A pointer to the allocated buffer.
|
|
662
675
|
*/
|
|
663
|
-
void* alloc(size_t size, size_t* actual_size) override {
|
|
676
|
+
void * alloc(size_t size, size_t * actual_size) override {
|
|
664
677
|
// round up the allocation size to the alignment to ensure that all
|
|
665
678
|
// allocations are aligned for all data types
|
|
666
679
|
const size_t alignment = 128;
|
|
667
|
-
size
|
|
680
|
+
size = GGML_PAD(size, alignment);
|
|
668
681
|
if (size == 0) {
|
|
669
682
|
size = alignment;
|
|
670
683
|
}
|
|
@@ -674,53 +687,51 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
674
687
|
if (size > avail) {
|
|
675
688
|
// round up to the next multiple of the granularity
|
|
676
689
|
size_t reserve_size = size - avail;
|
|
677
|
-
reserve_size
|
|
690
|
+
reserve_size = GGML_PAD(reserve_size, granularity);
|
|
678
691
|
|
|
679
692
|
GGML_ASSERT(pool_size + reserve_size <= max_size);
|
|
680
693
|
|
|
681
694
|
// allocate more physical memory
|
|
682
695
|
aclrtPhysicalMemProp prop = {};
|
|
683
|
-
prop.handleType
|
|
684
|
-
prop.allocationType
|
|
685
|
-
prop.memAttr
|
|
686
|
-
prop.location.type
|
|
687
|
-
prop.location.id
|
|
688
|
-
prop.reserve
|
|
696
|
+
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
|
|
697
|
+
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
|
|
698
|
+
prop.memAttr = ACL_HBM_MEM_HUGE;
|
|
699
|
+
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
|
|
700
|
+
prop.location.id = device;
|
|
701
|
+
prop.reserve = 0;
|
|
689
702
|
aclrtDrvMemHandle handle;
|
|
690
703
|
ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0));
|
|
691
704
|
|
|
692
705
|
// reserve virtual address space (if not already reserved)
|
|
693
706
|
if (pool_addr == 0) {
|
|
694
|
-
ACL_CHECK(aclrtReserveMemAddress(
|
|
695
|
-
&pool_addr, max_size, 0, NULL, 1));
|
|
707
|
+
ACL_CHECK(aclrtReserveMemAddress(&pool_addr, max_size, 0, NULL, 1));
|
|
696
708
|
}
|
|
697
709
|
|
|
698
710
|
// map at the end of the pool
|
|
699
|
-
ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0,
|
|
700
|
-
handle, 0));
|
|
711
|
+
ACL_CHECK(aclrtMapMem((char *) pool_addr + pool_size, reserve_size, 0, handle, 0));
|
|
701
712
|
|
|
702
713
|
handles.push_back(handle);
|
|
703
|
-
map_offsets.push_back((char*)pool_addr + pool_size);
|
|
714
|
+
map_offsets.push_back((char *) pool_addr + pool_size);
|
|
704
715
|
|
|
705
716
|
// add to the pool
|
|
706
717
|
pool_size += reserve_size;
|
|
707
718
|
|
|
708
719
|
#ifdef DEBUG_CANN_MALLOC
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
720
|
+
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", device,
|
|
721
|
+
(unsigned long long) (pool_size / 1024 / 1024),
|
|
722
|
+
(unsigned long long) (reserve_size / 1024 / 1024));
|
|
712
723
|
#endif
|
|
713
724
|
}
|
|
714
725
|
|
|
715
726
|
GGML_ASSERT(pool_addr != 0);
|
|
716
727
|
|
|
717
|
-
void* ptr
|
|
728
|
+
void * ptr = (void *) ((char *) pool_addr + pool_used);
|
|
718
729
|
*actual_size = size;
|
|
719
730
|
pool_used += size;
|
|
720
731
|
|
|
721
732
|
#ifdef DEBUG_CANN_MALLOC
|
|
722
|
-
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
|
|
723
|
-
|
|
733
|
+
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size,
|
|
734
|
+
(unsigned long long) ptr);
|
|
724
735
|
#endif
|
|
725
736
|
return ptr;
|
|
726
737
|
}
|
|
@@ -731,16 +742,16 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
731
742
|
* @param ptr Pointer to the buffer to free.
|
|
732
743
|
* @param size Size of the buffer to free.
|
|
733
744
|
*/
|
|
734
|
-
void free(void* ptr, size_t size) override {
|
|
745
|
+
void free(void * ptr, size_t size) override {
|
|
735
746
|
#ifdef DEBUG_CANN_MALLOC
|
|
736
|
-
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
|
|
737
|
-
|
|
747
|
+
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size,
|
|
748
|
+
(unsigned long long) ptr);
|
|
738
749
|
#endif
|
|
739
750
|
|
|
740
751
|
pool_used -= size;
|
|
741
752
|
|
|
742
753
|
// all deallocations must be in reverse order of the allocations
|
|
743
|
-
GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used));
|
|
754
|
+
GGML_ASSERT(ptr == (void *) ((char *) pool_addr + pool_used));
|
|
744
755
|
}
|
|
745
756
|
};
|
|
746
757
|
|
|
@@ -752,8 +763,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|
|
752
763
|
* @param device The device ID for which to create the pool.
|
|
753
764
|
* @return A unique pointer to the created CANN pool.
|
|
754
765
|
*/
|
|
755
|
-
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
756
|
-
int device) {
|
|
766
|
+
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(int device) {
|
|
757
767
|
std::string mem_pool_type = get_env("GGML_CANN_MEM_POOL").value_or("");
|
|
758
768
|
|
|
759
769
|
if (mem_pool_type == "prio") {
|
|
@@ -778,9 +788,8 @@ std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
|
|
|
778
788
|
* ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID.
|
|
779
789
|
*/
|
|
780
790
|
struct ggml_backend_cann_buffer_context {
|
|
781
|
-
int32_t device;
|
|
782
|
-
void*
|
|
783
|
-
nullptr; ///< Pointer to the device memory allocated for the buffer.
|
|
791
|
+
int32_t device; ///< The device ID associated with this buffer context.
|
|
792
|
+
void * dev_ptr = nullptr; ///< Pointer to the device memory allocated for the buffer.
|
|
784
793
|
|
|
785
794
|
/**
|
|
786
795
|
* @brief Constructor to initialize the CANN buffer context.
|
|
@@ -788,9 +797,7 @@ struct ggml_backend_cann_buffer_context {
|
|
|
788
797
|
* @param device The device ID associated with this buffer context.
|
|
789
798
|
* @param dev_ptr Pointer to the device memory allocated for the buffer.
|
|
790
799
|
*/
|
|
791
|
-
ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr)
|
|
792
|
-
: device(device),
|
|
793
|
-
dev_ptr(dev_ptr) {}
|
|
800
|
+
ggml_backend_cann_buffer_context(int32_t device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
|
794
801
|
|
|
795
802
|
/**
|
|
796
803
|
* @brief Destructor to free the device memory allocated for the buffer.
|
|
@@ -808,8 +815,8 @@ struct ggml_backend_cann_buffer_context {
|
|
|
808
815
|
* @return true if the buffer is a CANN buffer, false otherwise.
|
|
809
816
|
*/
|
|
810
817
|
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
|
|
811
|
-
|
|
812
|
-
|
|
818
|
+
|
|
819
|
+
static bool ggml_backend_buffer_is_cann(ggml_backend_buffer_t buffer) {
|
|
813
820
|
return ggml_backend_buft_is_cann(buffer->buft);
|
|
814
821
|
}
|
|
815
822
|
|
|
@@ -821,10 +828,8 @@ static bool ggml_backend_buffer_is_cann(
|
|
|
821
828
|
*
|
|
822
829
|
* @param buffer The CANN buffer to free.
|
|
823
830
|
*/
|
|
824
|
-
static void ggml_backend_cann_buffer_free_buffer(
|
|
825
|
-
|
|
826
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
827
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
831
|
+
static void ggml_backend_cann_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
|
832
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
828
833
|
delete ctx;
|
|
829
834
|
}
|
|
830
835
|
|
|
@@ -837,10 +842,8 @@ static void ggml_backend_cann_buffer_free_buffer(
|
|
|
837
842
|
* @param buffer The CANN buffer whose base pointer is to be retrieved.
|
|
838
843
|
* @return A pointer to the base of the device memory allocated for the buffer.
|
|
839
844
|
*/
|
|
840
|
-
static void* ggml_backend_cann_buffer_get_base(
|
|
841
|
-
|
|
842
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
843
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
845
|
+
static void * ggml_backend_cann_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|
846
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
844
847
|
return ctx->dev_ptr;
|
|
845
848
|
}
|
|
846
849
|
|
|
@@ -857,21 +860,17 @@ static void* ggml_backend_cann_buffer_get_base(
|
|
|
857
860
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
858
861
|
* stored.
|
|
859
862
|
*/
|
|
860
|
-
static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
int64_t n_elems = ggml_nelements(tensor);
|
|
865
|
-
int64_t groups = n_elems / QK4_0;
|
|
866
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
863
|
+
static void ggml_backend_cann_transform_q4_0(ggml_tensor * tensor, const void * src, void * dst) {
|
|
864
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
865
|
+
int64_t groups = n_elems / QK4_0;
|
|
866
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
867
867
|
|
|
868
|
-
uint8_t*
|
|
869
|
-
uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
|
|
868
|
+
uint8_t * quant_offset = (uint8_t *) dst;
|
|
869
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
|
|
870
870
|
|
|
871
871
|
for (int i = 0; i < groups; i++) {
|
|
872
|
-
const block_q4_0* group =
|
|
873
|
-
|
|
874
|
-
*scale_offset = group->d;
|
|
872
|
+
const block_q4_0 * group = (const block_q4_0 *) ((const char *) src + i * sizeof(block_q4_0));
|
|
873
|
+
*scale_offset = group->d;
|
|
875
874
|
scale_offset++;
|
|
876
875
|
|
|
877
876
|
// 0-15
|
|
@@ -890,8 +889,7 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
|
890
889
|
}
|
|
891
890
|
|
|
892
891
|
// put (uint4b_t -8) into int4b_t
|
|
893
|
-
for (quant_offset = (uint8_t*)dst;
|
|
894
|
-
quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) {
|
|
892
|
+
for (quant_offset = (uint8_t *) dst; quant_offset < (uint8_t *) dst + quant_bytes; quant_offset++) {
|
|
895
893
|
(*quant_offset) ^= 0x88;
|
|
896
894
|
}
|
|
897
895
|
}
|
|
@@ -909,29 +907,27 @@ static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|
|
909
907
|
* @param dst Pointer to the destination buffer where the Q4.0 formatted data
|
|
910
908
|
* will be stored.
|
|
911
909
|
*/
|
|
912
|
-
static void ggml_backend_cann_transform_back_q4_0(
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
int64_t groups = n_elems / QK4_0;
|
|
917
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
910
|
+
static void ggml_backend_cann_transform_back_q4_0(const ggml_tensor * tensor, void * src, void * dst) {
|
|
911
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
912
|
+
int64_t groups = n_elems / QK4_0;
|
|
913
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t) / 2;
|
|
918
914
|
|
|
919
|
-
uint8_t*
|
|
920
|
-
uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes);
|
|
915
|
+
uint8_t * quant_offset = (uint8_t *) src;
|
|
916
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) src + quant_bytes);
|
|
921
917
|
|
|
922
|
-
for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) {
|
|
918
|
+
for (; quant_offset < (uint8_t *) src + quant_bytes; quant_offset++) {
|
|
923
919
|
(*quant_offset) ^= 0x88;
|
|
924
920
|
}
|
|
925
|
-
quant_offset = (uint8_t*)src;
|
|
921
|
+
quant_offset = (uint8_t *) src;
|
|
926
922
|
|
|
927
923
|
for (int i = 0; i < groups; i++) {
|
|
928
|
-
block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0));
|
|
929
|
-
group->d
|
|
924
|
+
block_q4_0 * group = (block_q4_0 *) ((char *) dst + i * sizeof(block_q4_0));
|
|
925
|
+
group->d = *scale_offset;
|
|
930
926
|
scale_offset++;
|
|
931
927
|
|
|
932
928
|
// 0-15
|
|
933
929
|
for (int j = 0; j < QK4_0 / 2; j += 2) {
|
|
934
|
-
group->qs[j]
|
|
930
|
+
group->qs[j] = ((*quant_offset) & 0x0F);
|
|
935
931
|
group->qs[j + 1] = ((*quant_offset) >> 4);
|
|
936
932
|
quant_offset++;
|
|
937
933
|
}
|
|
@@ -958,20 +954,17 @@ static void ggml_backend_cann_transform_back_q4_0(
|
|
|
958
954
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
959
955
|
* stored.
|
|
960
956
|
*/
|
|
961
|
-
static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
int64_t groups = n_elems / QK8_0;
|
|
966
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
957
|
+
static void ggml_backend_cann_transform_q8_0(ggml_tensor * tensor, const void * src, void * dst) {
|
|
958
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
959
|
+
int64_t groups = n_elems / QK8_0;
|
|
960
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
967
961
|
|
|
968
|
-
uint8_t*
|
|
969
|
-
uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes);
|
|
962
|
+
uint8_t * quant_offset = (uint8_t *) dst;
|
|
963
|
+
uint16_t * scale_offset = (uint16_t *) ((char *) dst + quant_bytes);
|
|
970
964
|
|
|
971
965
|
for (int i = 0; i < groups; i++) {
|
|
972
|
-
const block_q8_0* group =
|
|
973
|
-
|
|
974
|
-
*scale_offset = group->d;
|
|
966
|
+
const block_q8_0 * group = (const block_q8_0 *) ((const char *) src + i * sizeof(block_q8_0));
|
|
967
|
+
*scale_offset = group->d;
|
|
975
968
|
scale_offset++;
|
|
976
969
|
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
|
|
977
970
|
memcpy(quant_offset, group->qs, group_quant_size);
|
|
@@ -992,19 +985,17 @@ static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|
|
992
985
|
* @param dst Pointer to the destination buffer where the Q8.0 formatted data
|
|
993
986
|
* will be stored.
|
|
994
987
|
*/
|
|
995
|
-
static void ggml_backend_cann_transform_back_q8_0(
|
|
996
|
-
|
|
997
|
-
int64_t n_elems
|
|
998
|
-
|
|
999
|
-
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
988
|
+
static void ggml_backend_cann_transform_back_q8_0(const ggml_tensor * tensor, const void * src, void * dst) {
|
|
989
|
+
int64_t n_elems = ggml_nelements(tensor);
|
|
990
|
+
int64_t groups = n_elems / QK8_0;
|
|
991
|
+
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
|
1000
992
|
|
|
1001
|
-
const uint8_t*
|
|
1002
|
-
const uint16_t* scale_offset =
|
|
1003
|
-
(const uint16_t*)((const char*)src + quant_bytes);
|
|
993
|
+
const uint8_t * quant_offset = (const uint8_t *) src;
|
|
994
|
+
const uint16_t * scale_offset = (const uint16_t *) ((const char *) src + quant_bytes);
|
|
1004
995
|
|
|
1005
996
|
for (int i = 0; i < groups; i++) {
|
|
1006
|
-
block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0));
|
|
1007
|
-
group->d
|
|
997
|
+
block_q8_0 * group = (block_q8_0 *) ((char *) dst + i * sizeof(block_q8_0));
|
|
998
|
+
group->d = *scale_offset;
|
|
1008
999
|
scale_offset++;
|
|
1009
1000
|
size_t group_quant_size = QK8_0 * sizeof(uint8_t);
|
|
1010
1001
|
memcpy(group->qs, quant_offset, group_quant_size);
|
|
@@ -1024,8 +1015,7 @@ static void ggml_backend_cann_transform_back_q8_0(
|
|
|
1024
1015
|
* @param dst Pointer to the destination buffer where transformed data will be
|
|
1025
1016
|
* stored.
|
|
1026
1017
|
*/
|
|
1027
|
-
static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
1028
|
-
const void* src, void* dst) {
|
|
1018
|
+
static void ggml_backend_cann_transform(ggml_tensor * tensor, const void * src, void * dst) {
|
|
1029
1019
|
switch (tensor->type) {
|
|
1030
1020
|
case GGML_TYPE_Q4_0:
|
|
1031
1021
|
ggml_backend_cann_transform_q4_0(tensor, src, dst);
|
|
@@ -1050,8 +1040,7 @@ static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|
|
1050
1040
|
* @param dst Pointer to the destination buffer where transformed tensor data
|
|
1051
1041
|
* will be stored.
|
|
1052
1042
|
*/
|
|
1053
|
-
static void ggml_backend_cann_transform_back(
|
|
1054
|
-
const ggml_tensor* tensor, void* src, void* dst) {
|
|
1043
|
+
static void ggml_backend_cann_transform_back(const ggml_tensor * tensor, void * src, void * dst) {
|
|
1055
1044
|
switch (tensor->type) {
|
|
1056
1045
|
case GGML_TYPE_Q4_0:
|
|
1057
1046
|
ggml_backend_cann_transform_back_q4_0(tensor, src, dst);
|
|
@@ -1092,8 +1081,7 @@ static bool need_transform(ggml_type type) {
|
|
|
1092
1081
|
* @param buffer The CANN buffer from which to initialize the tensor.
|
|
1093
1082
|
* @param tensor Pointer to the tensor to be initialized.
|
|
1094
1083
|
*/
|
|
1095
|
-
static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
1096
|
-
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
|
|
1084
|
+
static enum ggml_status ggml_backend_cann_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
|
1097
1085
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
|
1098
1086
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
|
1099
1087
|
return GGML_STATUS_SUCCESS;
|
|
@@ -1104,42 +1092,75 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
|
|
|
1104
1092
|
if (ggml_is_quantized(tensor->type)) {
|
|
1105
1093
|
// Initialize padding to 0 to avoid possible NaN values
|
|
1106
1094
|
size_t original_size = ggml_nbytes(tensor);
|
|
1107
|
-
size_t padded_size
|
|
1108
|
-
ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
|
1095
|
+
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
|
1109
1096
|
|
|
1110
1097
|
if (padded_size > original_size && tensor->view_src == nullptr) {
|
|
1111
1098
|
size_t memset_size = padded_size - original_size;
|
|
1112
|
-
ACL_CHECK(aclrtMemset((char*)tensor->data + original_size,
|
|
1113
|
-
memset_size, 0, memset_size));
|
|
1099
|
+
ACL_CHECK(aclrtMemset((char *) tensor->data + original_size, memset_size, 0, memset_size));
|
|
1114
1100
|
}
|
|
1115
1101
|
}
|
|
1116
1102
|
return GGML_STATUS_SUCCESS;
|
|
1117
1103
|
}
|
|
1118
1104
|
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1105
|
+
/**
|
|
1106
|
+
* @brief Workspace for caching NZ buffers per device.
|
|
1107
|
+
*
|
|
1108
|
+
* This struct manages a device buffer used in NZ computations. It supports
|
|
1109
|
+
* allocation, reallocation, and clearing of cached memory. The struct is
|
|
1110
|
+
* designed to be used with a global array, one per device.
|
|
1111
|
+
*/
|
|
1112
|
+
struct ggml_cann_nz_workspace {
|
|
1113
|
+
void * ptr; // Pointer to allocated device buffer
|
|
1114
|
+
size_t allocated; // Size of currently allocated buffer in bytes
|
|
1115
|
+
|
|
1116
|
+
/**
|
|
1117
|
+
* @brief Constructor. Initializes the workspace with no allocated memory.
|
|
1118
|
+
*/
|
|
1119
|
+
ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
|
|
1123
1120
|
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1121
|
+
/**
|
|
1122
|
+
* @brief Free cached memory and reset the workspace.
|
|
1123
|
+
*
|
|
1124
|
+
* If a buffer has been allocated, this function releases it using
|
|
1125
|
+
* aclrtFree and resets internal state.
|
|
1126
|
+
*/
|
|
1127
|
+
void clear() {
|
|
1128
|
+
if (ptr) {
|
|
1129
|
+
ACL_CHECK(aclrtFree(ptr));
|
|
1130
|
+
ptr = nullptr;
|
|
1131
|
+
allocated = 0;
|
|
1129
1132
|
}
|
|
1130
1133
|
}
|
|
1131
1134
|
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1135
|
+
/**
|
|
1136
|
+
* @brief Allocate or reallocate the workspace buffer.
|
|
1137
|
+
*
|
|
1138
|
+
* If the requested size is larger than the currently allocated size,
|
|
1139
|
+
* the old buffer will be freed and a new buffer of the requested size
|
|
1140
|
+
* will be allocated on the device.
|
|
1141
|
+
*
|
|
1142
|
+
* @param new_size Size in bytes to allocate for the workspace.
|
|
1143
|
+
*/
|
|
1144
|
+
void realloc(size_t new_size) {
|
|
1145
|
+
if (new_size > allocated) {
|
|
1146
|
+
clear();
|
|
1147
|
+
ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
|
1148
|
+
allocated = new_size;
|
|
1137
1149
|
}
|
|
1138
|
-
ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
|
|
1139
|
-
g_nz_workspace_allocated = new_size;
|
|
1140
|
-
}
|
|
1141
1150
|
}
|
|
1142
|
-
|
|
1151
|
+
|
|
1152
|
+
/**
|
|
1153
|
+
* @brief Get the device buffer pointer.
|
|
1154
|
+
*
|
|
1155
|
+
* @return Pointer to the allocated buffer, or nullptr if not allocated.
|
|
1156
|
+
*/
|
|
1157
|
+
void * get() const { return ptr; }
|
|
1158
|
+
};
|
|
1159
|
+
|
|
1160
|
+
/**
|
|
1161
|
+
* @brief Global array of NZ workspaces, one per device.
|
|
1162
|
+
*/
|
|
1163
|
+
static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
|
|
1143
1164
|
|
|
1144
1165
|
/**
|
|
1145
1166
|
* @brief Convert tensor weights to NZ format using Ascend CANN API.
|
|
@@ -1149,26 +1170,25 @@ namespace {
|
|
|
1149
1170
|
* improve performance on certain hardware.
|
|
1150
1171
|
*
|
|
1151
1172
|
* @param tensor Pointer to the input ggml_tensor containing the weights.
|
|
1152
|
-
* @param data Pointer to the raw data buffer for the tensor weights.
|
|
1153
1173
|
* @param offset Byte offset within the tensor data buffer where weights start.
|
|
1174
|
+
* @param device device id.
|
|
1154
1175
|
*
|
|
1155
1176
|
* @note The workspace buffer used in this function is managed globally and reused
|
|
1156
1177
|
* across calls. This reduces overhead from repeated memory allocation and deallocation.
|
|
1157
1178
|
*/
|
|
1158
|
-
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
aclOpExecutor *executor;
|
|
1179
|
+
static void weight_format_to_nz(ggml_tensor * tensor, size_t offset, int device) {
|
|
1180
|
+
acl_tensor_ptr weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne, tensor->nb, 2, ACL_FORMAT_ND, offset);
|
|
1181
|
+
uint64_t workspaceSize = 0;
|
|
1182
|
+
aclOpExecutor * executor;
|
|
1163
1183
|
|
|
1164
1184
|
// TransMatmulWeight
|
|
1165
|
-
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
|
|
1166
|
-
&workspaceSize, &executor));
|
|
1185
|
+
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed.get(), &workspaceSize, &executor));
|
|
1167
1186
|
// Avoid frequent malloc/free of the workspace.
|
|
1168
|
-
|
|
1187
|
+
g_nz_workspaces[device].realloc(workspaceSize);
|
|
1188
|
+
|
|
1189
|
+
void * g_nz_workspace = g_nz_workspaces[device].get();
|
|
1169
1190
|
|
|
1170
1191
|
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
|
|
1171
|
-
ACL_CHECK(aclDestroyTensor(weightTransposed));
|
|
1172
1192
|
}
|
|
1173
1193
|
|
|
1174
1194
|
// TODO: need handle tensor which has paddings.
|
|
@@ -1184,11 +1204,12 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
|
|
|
1184
1204
|
* @param offset Offset in the source data from where to start copying.
|
|
1185
1205
|
* @param size Size of the data to be copied, in bytes.
|
|
1186
1206
|
*/
|
|
1187
|
-
static void ggml_backend_cann_buffer_set_tensor(
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1207
|
+
static void ggml_backend_cann_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
|
1208
|
+
ggml_tensor * tensor,
|
|
1209
|
+
const void * data,
|
|
1210
|
+
size_t offset,
|
|
1211
|
+
size_t size) {
|
|
1212
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1192
1213
|
|
|
1193
1214
|
ggml_cann_set_device(ctx->device);
|
|
1194
1215
|
// TODO: refer to cann(#6017), it use thread's default stream.
|
|
@@ -1196,22 +1217,19 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|
|
1196
1217
|
// Why aclrtSynchronizeDevice?
|
|
1197
1218
|
|
|
1198
1219
|
// Only check env once.
|
|
1199
|
-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
|
1220
|
+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
|
1200
1221
|
if (!need_transform(tensor->type)) {
|
|
1201
|
-
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
|
|
1202
|
-
|
|
1203
|
-
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
|
1222
|
+
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1223
|
+
if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
|
1204
1224
|
GGML_ASSERT(tensor->ne[2] == 1);
|
|
1205
1225
|
GGML_ASSERT(tensor->ne[3] == 1);
|
|
1206
|
-
weight_format_to_nz(tensor, offset);
|
|
1226
|
+
weight_format_to_nz(tensor, offset, ctx->device);
|
|
1207
1227
|
}
|
|
1208
1228
|
} else {
|
|
1209
|
-
void *transform_buffer = malloc(size);
|
|
1229
|
+
void * transform_buffer = malloc(size);
|
|
1210
1230
|
ggml_backend_cann_transform(tensor, data, transform_buffer);
|
|
1211
1231
|
|
|
1212
|
-
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
|
|
1213
|
-
transform_buffer, size,
|
|
1214
|
-
ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1232
|
+
ACL_CHECK(aclrtMemcpy((char *) tensor->data + offset, size, transform_buffer, size, ACL_MEMCPY_HOST_TO_DEVICE));
|
|
1215
1233
|
free(transform_buffer);
|
|
1216
1234
|
}
|
|
1217
1235
|
}
|
|
@@ -1229,22 +1247,20 @@ static void ggml_backend_cann_buffer_set_tensor(
|
|
|
1229
1247
|
* @param offset Offset in the destination buffer where to start copying.
|
|
1230
1248
|
* @param size Size of the data to be copied, in bytes.
|
|
1231
1249
|
*/
|
|
1232
|
-
static void ggml_backend_cann_buffer_get_tensor(
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1250
|
+
static void ggml_backend_cann_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
|
1251
|
+
const ggml_tensor * tensor,
|
|
1252
|
+
void * data,
|
|
1253
|
+
size_t offset,
|
|
1254
|
+
size_t size) {
|
|
1255
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1237
1256
|
|
|
1238
1257
|
ggml_cann_set_device(ctx->device);
|
|
1239
1258
|
|
|
1240
1259
|
if (!need_transform(tensor->type)) {
|
|
1241
|
-
ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size,
|
|
1242
|
-
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1260
|
+
ACL_CHECK(aclrtMemcpy(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1243
1261
|
} else {
|
|
1244
|
-
void* transform_buffer = malloc(size);
|
|
1245
|
-
ACL_CHECK(aclrtMemcpy(transform_buffer, size,
|
|
1246
|
-
(char*)tensor->data + offset, size,
|
|
1247
|
-
ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1262
|
+
void * transform_buffer = malloc(size);
|
|
1263
|
+
ACL_CHECK(aclrtMemcpy(transform_buffer, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST));
|
|
1248
1264
|
ggml_backend_cann_transform_back(tensor, transform_buffer, data);
|
|
1249
1265
|
free(transform_buffer);
|
|
1250
1266
|
}
|
|
@@ -1263,31 +1279,31 @@ static void ggml_backend_cann_buffer_get_tensor(
|
|
|
1263
1279
|
* @param dst Pointer to the destination tensor where the data will be copied.
|
|
1264
1280
|
* @return true if the copy operation succeeded, false otherwise.
|
|
1265
1281
|
*/
|
|
1266
|
-
static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
1267
|
-
|
|
1282
|
+
static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
|
1283
|
+
const ggml_tensor * src,
|
|
1284
|
+
ggml_tensor * dst) {
|
|
1268
1285
|
if (ggml_backend_buffer_is_cann(src->buffer)) {
|
|
1269
|
-
ggml_backend_cann_buffer_context* src_ctx =
|
|
1270
|
-
|
|
1271
|
-
ggml_backend_cann_buffer_context* dst_ctx =
|
|
1272
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
1286
|
+
ggml_backend_cann_buffer_context * src_ctx = (ggml_backend_cann_buffer_context *) src->buffer->context;
|
|
1287
|
+
ggml_backend_cann_buffer_context * dst_ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1273
1288
|
|
|
1274
1289
|
size_t memcpy_size = ggml_nbytes(src);
|
|
1275
1290
|
// Same device.
|
|
1276
1291
|
if (src_ctx->device == dst_ctx->device) {
|
|
1277
|
-
ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
|
|
1278
|
-
(const char*)src->data, memcpy_size,
|
|
1292
|
+
ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
|
|
1279
1293
|
ACL_MEMCPY_DEVICE_TO_DEVICE));
|
|
1280
1294
|
return true;
|
|
1281
1295
|
} else {
|
|
1296
|
+
#ifdef ASCEND_310P
|
|
1297
|
+
// TODO: Support 310p P2P copy
|
|
1298
|
+
return false;
|
|
1299
|
+
#endif
|
|
1282
1300
|
// Different device but can access by peer.
|
|
1283
1301
|
int32_t canAccessPeer = 0;
|
|
1284
|
-
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
|
|
1285
|
-
dst_ctx->device));
|
|
1302
|
+
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, dst_ctx->device));
|
|
1286
1303
|
if (canAccessPeer) {
|
|
1287
1304
|
ggml_cann_set_device(src_ctx->device);
|
|
1288
1305
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0));
|
|
1289
|
-
ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size,
|
|
1290
|
-
(const char*)src->data, memcpy_size,
|
|
1306
|
+
ACL_CHECK(aclrtMemcpy((char *) dst->data, memcpy_size, (const char *) src->data, memcpy_size,
|
|
1291
1307
|
ACL_MEMCPY_DEVICE_TO_DEVICE));
|
|
1292
1308
|
return true;
|
|
1293
1309
|
}
|
|
@@ -1305,10 +1321,8 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
|
|
|
1305
1321
|
* @param buffer The CANN buffer to be cleared.
|
|
1306
1322
|
* @param value The value to which each byte in the buffer will be set.
|
|
1307
1323
|
*/
|
|
1308
|
-
static void ggml_backend_cann_buffer_clear(
|
|
1309
|
-
|
|
1310
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
1311
|
-
(ggml_backend_cann_buffer_context*)buffer->context;
|
|
1324
|
+
static void ggml_backend_cann_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
|
1325
|
+
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
|
1312
1326
|
|
|
1313
1327
|
ggml_cann_set_device(ctx->device);
|
|
1314
1328
|
ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size));
|
|
@@ -1338,9 +1352,8 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
|
|
1338
1352
|
* buffer type.
|
|
1339
1353
|
*/
|
|
1340
1354
|
struct ggml_backend_cann_buffer_type_context {
|
|
1341
|
-
int32_t
|
|
1342
|
-
|
|
1343
|
-
std::string name; /**< Name associated with the buffer context. */
|
|
1355
|
+
int32_t device; /**< Device identifier associated with the buffer context. */
|
|
1356
|
+
std::string name; /**< Name associated with the buffer context. */
|
|
1344
1357
|
};
|
|
1345
1358
|
|
|
1346
1359
|
/**
|
|
@@ -1352,10 +1365,8 @@ struct ggml_backend_cann_buffer_type_context {
|
|
|
1352
1365
|
* @param buft Pointer to the buffer type context.
|
|
1353
1366
|
* @return Const pointer to the C-style string containing the name.
|
|
1354
1367
|
*/
|
|
1355
|
-
static const char* ggml_backend_cann_buffer_type_name(
|
|
1356
|
-
|
|
1357
|
-
ggml_backend_cann_buffer_type_context* buft_ctx =
|
|
1358
|
-
(ggml_backend_cann_buffer_type_context*)buft->context;
|
|
1368
|
+
static const char * ggml_backend_cann_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
|
1369
|
+
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
|
|
1359
1370
|
|
|
1360
1371
|
return buft_ctx->name.c_str();
|
|
1361
1372
|
}
|
|
@@ -1370,34 +1381,27 @@ static const char* ggml_backend_cann_buffer_type_name(
|
|
|
1370
1381
|
* @param size Size in bytes of the buffer to allocate.
|
|
1371
1382
|
* @return Pointer to the allocated buffer, or nullptr if allocation fails.
|
|
1372
1383
|
*/
|
|
1373
|
-
static ggml_backend_buffer_t
|
|
1374
|
-
|
|
1375
|
-
size_t size) {
|
|
1376
|
-
ggml_backend_cann_buffer_type_context* buft_ctx =
|
|
1377
|
-
(ggml_backend_cann_buffer_type_context*)buft->context;
|
|
1384
|
+
static ggml_backend_buffer_t ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
|
1385
|
+
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
|
|
1378
1386
|
|
|
1379
1387
|
ggml_cann_set_device(buft_ctx->device);
|
|
1380
1388
|
|
|
1381
1389
|
const size_t alignment = 128;
|
|
1382
|
-
size
|
|
1390
|
+
size = GGML_PAD(size, alignment);
|
|
1383
1391
|
if (size == 0) {
|
|
1384
1392
|
size = alignment;
|
|
1385
1393
|
}
|
|
1386
|
-
void*
|
|
1394
|
+
void * dev_ptr;
|
|
1387
1395
|
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
|
1388
1396
|
if (err != ACL_SUCCESS) {
|
|
1389
|
-
GGML_LOG_ERROR(
|
|
1390
|
-
|
|
1391
|
-
__func__, size / 1024.0 / 1024.0, buft_ctx->device,
|
|
1392
|
-
aclGetRecentErrMsg());
|
|
1397
|
+
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", __func__,
|
|
1398
|
+
size / 1024.0 / 1024.0, buft_ctx->device, aclGetRecentErrMsg());
|
|
1393
1399
|
return nullptr;
|
|
1394
1400
|
}
|
|
1395
1401
|
|
|
1396
|
-
ggml_backend_cann_buffer_context* ctx =
|
|
1397
|
-
new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
|
|
1402
|
+
ggml_backend_cann_buffer_context * ctx = new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr);
|
|
1398
1403
|
|
|
1399
|
-
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface,
|
|
1400
|
-
ctx, size);
|
|
1404
|
+
return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, ctx, size);
|
|
1401
1405
|
}
|
|
1402
1406
|
|
|
1403
1407
|
/**
|
|
@@ -1412,8 +1416,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
|
1412
1416
|
* @return The alignment requirement in bytes (fixed at 128 bytes for CANN
|
|
1413
1417
|
* buffers).
|
|
1414
1418
|
*/
|
|
1415
|
-
static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
1416
|
-
ggml_backend_buffer_type_t buft) {
|
|
1419
|
+
static size_t ggml_backend_cann_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
|
1417
1420
|
return 128;
|
|
1418
1421
|
|
|
1419
1422
|
GGML_UNUSED(buft);
|
|
@@ -1433,13 +1436,13 @@ static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|
|
1433
1436
|
* @return The total allocation size in bytes required for the tensor in the
|
|
1434
1437
|
* CANN buffer.
|
|
1435
1438
|
*/
|
|
1436
|
-
static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
1437
|
-
|
|
1438
|
-
size_t
|
|
1439
|
-
int64_t ne0
|
|
1439
|
+
static size_t ggml_backend_cann_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft,
|
|
1440
|
+
const ggml_tensor * tensor) {
|
|
1441
|
+
size_t size = ggml_nbytes(tensor);
|
|
1442
|
+
int64_t ne0 = tensor->ne[0];
|
|
1440
1443
|
|
|
1441
1444
|
// Only check env once.
|
|
1442
|
-
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
|
|
1445
|
+
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
|
|
1443
1446
|
|
|
1444
1447
|
// last line must be bigger than 32, because every single op deals with at
|
|
1445
1448
|
// least 32 bytes.
|
|
@@ -1449,19 +1452,17 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
|
|
1449
1452
|
// size += (line_size_align_32 - line_size);
|
|
1450
1453
|
if (ggml_is_quantized(tensor->type)) {
|
|
1451
1454
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
|
1452
|
-
size += ggml_row_size(
|
|
1453
|
-
tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
1455
|
+
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
|
1454
1456
|
}
|
|
1455
|
-
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
|
|
1457
|
+
} else if (weight_to_nz && is_matmul_weight((const ggml_tensor *) tensor)) {
|
|
1456
1458
|
// Quantized weights are not supported in NZ format yet.
|
|
1457
1459
|
// If an ND tensor is transformed to NZ, its size may change.
|
|
1458
|
-
int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
|
|
1460
|
+
int64_t shape[] = { tensor->ne[1], tensor->ne[0] };
|
|
1459
1461
|
GGML_ASSERT(tensor->ne[2] == 1);
|
|
1460
1462
|
GGML_ASSERT(tensor->ne[3] == 1);
|
|
1461
|
-
const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
|
|
1462
|
-
size_t
|
|
1463
|
-
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
|
|
1464
|
-
ggml_cann_type_mapping(tensor->type), &new_size));
|
|
1463
|
+
const aclIntArray * acl_shape = aclCreateIntArray(shape, 2);
|
|
1464
|
+
size_t new_size;
|
|
1465
|
+
ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape, ggml_cann_type_mapping(tensor->type), &new_size));
|
|
1465
1466
|
ACL_CHECK(aclDestroyIntArray(acl_shape));
|
|
1466
1467
|
size = std::max(size, new_size);
|
|
1467
1468
|
}
|
|
@@ -1502,17 +1503,15 @@ static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface
|
|
|
1502
1503
|
* @return A pointer to the buffer type interface for the specified device, or
|
|
1503
1504
|
* nullptr if the device index is out of range.
|
|
1504
1505
|
*/
|
|
1505
|
-
ggml_backend_buffer_type_t
|
|
1506
|
-
|
|
1507
|
-
static std::mutex mutex;
|
|
1506
|
+
ggml_backend_buffer_type_t ggml_backend_cann_buffer_type(int32_t device) {
|
|
1507
|
+
static std::mutex mutex;
|
|
1508
1508
|
std::lock_guard<std::mutex> lock(mutex);
|
|
1509
1509
|
|
|
1510
1510
|
if (device >= ggml_backend_cann_get_device_count()) {
|
|
1511
1511
|
return nullptr;
|
|
1512
1512
|
}
|
|
1513
1513
|
|
|
1514
|
-
static ggml_backend_buffer_type
|
|
1515
|
-
ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
|
|
1514
|
+
static ggml_backend_buffer_type ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES];
|
|
1516
1515
|
|
|
1517
1516
|
static bool ggml_backend_cann_buffer_type_initialized = false;
|
|
1518
1517
|
|
|
@@ -1522,8 +1521,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|
|
1522
1521
|
/* .iface = */ ggml_backend_cann_buffer_type_interface,
|
|
1523
1522
|
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
|
|
1524
1523
|
/* .context = */
|
|
1525
|
-
|
|
1526
|
-
i, "CANN" + std::to_string(i)},
|
|
1524
|
+
new ggml_backend_cann_buffer_type_context{ i, "CANN" + std::to_string(i) },
|
|
1527
1525
|
};
|
|
1528
1526
|
}
|
|
1529
1527
|
ggml_backend_cann_buffer_type_initialized = true;
|
|
@@ -1587,16 +1585,16 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|
|
1587
1585
|
}
|
|
1588
1586
|
|
|
1589
1587
|
const size_t alignment = 128;
|
|
1590
|
-
size
|
|
1588
|
+
size = GGML_PAD(size, alignment);
|
|
1591
1589
|
if (size == 0) {
|
|
1592
1590
|
size = alignment;
|
|
1593
1591
|
}
|
|
1594
1592
|
|
|
1595
|
-
void *
|
|
1596
|
-
aclError err
|
|
1593
|
+
void * hostPtr = nullptr;
|
|
1594
|
+
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
|
1597
1595
|
if (err != ACL_SUCCESS) {
|
|
1598
|
-
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
|
1599
|
-
|
|
1596
|
+
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size / 1024.0 / 1024.0,
|
|
1597
|
+
aclGetRecentErrMsg());
|
|
1600
1598
|
return nullptr;
|
|
1601
1599
|
}
|
|
1602
1600
|
return hostPtr;
|
|
@@ -1609,7 +1607,8 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|
|
1609
1607
|
* @param size Size in bytes of the host buffer to allocate.
|
|
1610
1608
|
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
|
1611
1609
|
*/
|
|
1612
|
-
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
1610
|
+
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|
1611
|
+
size_t size) {
|
|
1613
1612
|
void * hostPtr = ggml_cann_host_malloc(size);
|
|
1614
1613
|
|
|
1615
1614
|
if (hostPtr == nullptr) {
|
|
@@ -1618,8 +1617,8 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
|
|
1618
1617
|
}
|
|
1619
1618
|
|
|
1620
1619
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
|
|
1621
|
-
buffer->buft
|
|
1622
|
-
buffer->iface.free_buffer
|
|
1620
|
+
buffer->buft = buft;
|
|
1621
|
+
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
|
|
1623
1622
|
|
|
1624
1623
|
return buffer;
|
|
1625
1624
|
}
|
|
@@ -1633,14 +1632,15 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
|
|
|
1633
1632
|
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|
1634
1633
|
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
|
1635
1634
|
/* .iface = */ {
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1635
|
+
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
|
1636
|
+
/* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
|
|
1637
|
+
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
|
1638
|
+
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
|
1640
1639
|
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
/* .device = */
|
|
1640
|
+
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
|
1641
|
+
},
|
|
1642
|
+
/* .device = */
|
|
1643
|
+
ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
|
|
1644
1644
|
/* .context = */ nullptr,
|
|
1645
1645
|
};
|
|
1646
1646
|
|
|
@@ -1660,8 +1660,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|
|
1660
1660
|
* stored.
|
|
1661
1661
|
* @return true if the computation was successful; false otherwise.
|
|
1662
1662
|
*/
|
|
1663
|
-
static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
1664
|
-
struct ggml_tensor* dst) {
|
|
1663
|
+
static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct ggml_tensor * dst) {
|
|
1665
1664
|
switch (dst->op) {
|
|
1666
1665
|
case GGML_OP_REPEAT:
|
|
1667
1666
|
ggml_cann_repeat(ctx, dst);
|
|
@@ -1707,14 +1706,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1707
1706
|
case GGML_UNARY_OP_SILU:
|
|
1708
1707
|
GGML_CANN_CALL_OP_UNARY(Silu);
|
|
1709
1708
|
break;
|
|
1710
|
-
case GGML_UNARY_OP_GELU_QUICK:
|
|
1711
|
-
|
|
1712
|
-
aclTensor* acl_src,
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1709
|
+
case GGML_UNARY_OP_GELU_QUICK:
|
|
1710
|
+
{
|
|
1711
|
+
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1712
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
|
1713
|
+
};
|
|
1714
|
+
ggml_cann_op_unary(lambda, ctx, dst);
|
|
1715
|
+
}
|
|
1716
|
+
break;
|
|
1718
1717
|
case GGML_UNARY_OP_TANH:
|
|
1719
1718
|
GGML_CANN_CALL_OP_UNARY(Tanh);
|
|
1720
1719
|
break;
|
|
@@ -1759,14 +1758,14 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1759
1758
|
case GGML_GLU_OP_SWIGLU:
|
|
1760
1759
|
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
|
1761
1760
|
break;
|
|
1762
|
-
case GGML_GLU_OP_GEGLU_QUICK:
|
|
1763
|
-
|
|
1764
|
-
aclTensor* acl_src,
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1761
|
+
case GGML_GLU_OP_GEGLU_QUICK:
|
|
1762
|
+
{
|
|
1763
|
+
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
|
1764
|
+
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
|
1765
|
+
};
|
|
1766
|
+
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
|
1767
|
+
}
|
|
1768
|
+
break;
|
|
1770
1769
|
default:
|
|
1771
1770
|
return false;
|
|
1772
1771
|
}
|
|
@@ -1777,6 +1776,12 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1777
1776
|
case GGML_OP_GROUP_NORM:
|
|
1778
1777
|
ggml_cann_group_norm(ctx, dst);
|
|
1779
1778
|
break;
|
|
1779
|
+
case GGML_OP_L2_NORM:
|
|
1780
|
+
ggml_cann_l2_norm(ctx, dst);
|
|
1781
|
+
break;
|
|
1782
|
+
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
1783
|
+
ggml_cann_cross_entropy_loss(ctx, dst);
|
|
1784
|
+
break;
|
|
1780
1785
|
case GGML_OP_CONCAT:
|
|
1781
1786
|
ggml_cann_concat(ctx, dst);
|
|
1782
1787
|
break;
|
|
@@ -1898,9 +1903,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|
|
1898
1903
|
* @param backend Pointer to the CANN backend structure.
|
|
1899
1904
|
* @return A pointer to a constant string representing the backend name.
|
|
1900
1905
|
*/
|
|
1901
|
-
static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
1902
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1903
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1906
|
+
static const char * ggml_backend_cann_name(ggml_backend_t backend) {
|
|
1907
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1904
1908
|
|
|
1905
1909
|
return cann_ctx->name.c_str();
|
|
1906
1910
|
}
|
|
@@ -1914,8 +1918,7 @@ static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|
|
1914
1918
|
* @param backend Pointer to the CANN backend structure to be freed.
|
|
1915
1919
|
*/
|
|
1916
1920
|
static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
1917
|
-
ggml_backend_cann_context* cann_ctx =
|
|
1918
|
-
(ggml_backend_cann_context*)backend->context;
|
|
1921
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1919
1922
|
ACL_CHECK(aclrtSynchronizeDevice());
|
|
1920
1923
|
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
|
|
1921
1924
|
|
|
@@ -1923,7 +1926,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
|
1923
1926
|
delete backend;
|
|
1924
1927
|
}
|
|
1925
1928
|
|
|
1926
|
-
|
|
1927
1929
|
/**
|
|
1928
1930
|
* @brief Sets tensor data asynchronously in the CANN backend.
|
|
1929
1931
|
*
|
|
@@ -1936,21 +1938,18 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|
|
1936
1938
|
* @param size Size of the data to copy in bytes.
|
|
1937
1939
|
*/
|
|
1938
1940
|
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
|
1939
|
-
ggml_tensor *tensor,
|
|
1940
|
-
const void *data,
|
|
1941
|
-
size_t
|
|
1942
|
-
size_t
|
|
1943
|
-
ggml_backend_cann_context *cann_ctx =
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
|
1949
|
-
"unsupported buffer type");
|
|
1941
|
+
ggml_tensor * tensor,
|
|
1942
|
+
const void * data,
|
|
1943
|
+
size_t offset,
|
|
1944
|
+
size_t size) {
|
|
1945
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1946
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1947
|
+
|
|
1948
|
+
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
|
|
1950
1949
|
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
|
1951
1950
|
|
|
1952
|
-
|
|
1953
|
-
|
|
1951
|
+
ACL_CHECK(aclrtMemcpyAsync((char *) tensor->data + offset, size, data, size, ACL_MEMCPY_HOST_TO_DEVICE,
|
|
1952
|
+
cann_ctx->stream()));
|
|
1954
1953
|
}
|
|
1955
1954
|
|
|
1956
1955
|
/**
|
|
@@ -1964,21 +1963,19 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
|
|
1964
1963
|
* @param offset Offset in bytes within the host data.
|
|
1965
1964
|
* @param size Size of the data to copy in bytes.
|
|
1966
1965
|
*/
|
|
1967
|
-
static void ggml_backend_cann_get_tensor_async(
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
|
|
1973
|
-
|
|
1966
|
+
static void ggml_backend_cann_get_tensor_async(ggml_backend_t backend,
|
|
1967
|
+
const ggml_tensor * tensor,
|
|
1968
|
+
void * data,
|
|
1969
|
+
size_t offset,
|
|
1970
|
+
size_t size) {
|
|
1971
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
1972
|
+
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
|
1974
1973
|
|
|
1975
|
-
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) &&
|
|
1976
|
-
"unsupported buffer type");
|
|
1974
|
+
GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && "unsupported buffer type");
|
|
1977
1975
|
GGML_ASSERT(!ggml_is_quantized(tensor->type));
|
|
1978
1976
|
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1977
|
+
ACL_CHECK(aclrtMemcpyAsync(data, size, (char *) tensor->data + offset, size, ACL_MEMCPY_DEVICE_TO_HOST,
|
|
1978
|
+
cann_ctx->stream()));
|
|
1982
1979
|
}
|
|
1983
1980
|
|
|
1984
1981
|
/**
|
|
@@ -1994,65 +1991,67 @@ static void ggml_backend_cann_get_tensor_async(
|
|
|
1994
1991
|
* @param dst Pointer to the destination tensor to copy data to.
|
|
1995
1992
|
* @return true if the copy operation succeeds, false otherwise.
|
|
1996
1993
|
*/
|
|
1997
|
-
static bool ggml_backend_cann_cpy_tensor_async(
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
|
|
1994
|
+
static bool ggml_backend_cann_cpy_tensor_async(ggml_backend_t backend_src,
|
|
1995
|
+
ggml_backend_t backend_dst,
|
|
1996
|
+
const ggml_tensor * src,
|
|
1997
|
+
ggml_tensor * dst) {
|
|
1998
|
+
GGML_ASSERT(ggml_backend_is_cann(backend_src) || ggml_backend_is_cann(backend_dst));
|
|
1999
|
+
|
|
2000
|
+
GGML_ASSERT(!is_matmul_weight((const ggml_tensor *) src));
|
|
2002
2001
|
|
|
2003
|
-
if (!ggml_backend_buffer_is_cann(src->buffer) ||
|
|
2004
|
-
!ggml_backend_buffer_is_cann(dst->buffer)) {
|
|
2002
|
+
if (!ggml_backend_buffer_is_cann(src->buffer) || !ggml_backend_buffer_is_cann(dst->buffer)) {
|
|
2005
2003
|
return false;
|
|
2006
2004
|
}
|
|
2007
2005
|
|
|
2008
|
-
ggml_backend_buffer_t buf_src =
|
|
2009
|
-
|
|
2010
|
-
ggml_backend_buffer_t buf_dst =
|
|
2011
|
-
dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
2006
|
+
ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer;
|
|
2007
|
+
ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer;
|
|
2012
2008
|
|
|
2013
|
-
ggml_backend_cann_context* cann_ctx_src =
|
|
2014
|
-
|
|
2015
|
-
ggml_backend_cann_context* cann_ctx_dst =
|
|
2016
|
-
(ggml_backend_cann_context*)backend_dst->context;
|
|
2009
|
+
ggml_backend_cann_context * cann_ctx_src = (ggml_backend_cann_context *) backend_src->context;
|
|
2010
|
+
ggml_backend_cann_context * cann_ctx_dst = (ggml_backend_cann_context *) backend_dst->context;
|
|
2017
2011
|
|
|
2018
2012
|
size_t copy_size = ggml_nbytes(dst);
|
|
2019
2013
|
if (copy_size == 0) {
|
|
2020
2014
|
return true;
|
|
2021
2015
|
}
|
|
2022
2016
|
if (backend_src != backend_dst) {
|
|
2023
|
-
|
|
2024
|
-
|
|
2025
|
-
|
|
2026
|
-
|
|
2017
|
+
#ifdef ASCEND_310P
|
|
2018
|
+
// TODO: Support 310p P2P copy
|
|
2019
|
+
return false;
|
|
2020
|
+
#endif
|
|
2021
|
+
ggml_backend_cann_buffer_context * buf_ctx_src = (ggml_backend_cann_buffer_context *) buf_src->context;
|
|
2022
|
+
ggml_backend_cann_buffer_context * buf_ctx_dst = (ggml_backend_cann_buffer_context *) buf_dst->context;
|
|
2027
2023
|
|
|
2028
2024
|
GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device);
|
|
2029
2025
|
GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device);
|
|
2030
2026
|
|
|
2031
2027
|
int32_t canAccessPeer = 0;
|
|
2032
|
-
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device,
|
|
2033
|
-
cann_ctx_dst->device));
|
|
2028
|
+
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, cann_ctx_dst->device));
|
|
2034
2029
|
if (!canAccessPeer) {
|
|
2035
2030
|
return false;
|
|
2036
2031
|
}
|
|
2037
2032
|
|
|
2038
2033
|
// need to open both directions for memcpyasync between devices.
|
|
2039
|
-
ggml_cann_set_device(cann_ctx_dst->device);
|
|
2040
2034
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
|
|
2041
2035
|
ggml_cann_set_device(cann_ctx_src->device);
|
|
2042
2036
|
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
|
|
2043
2037
|
|
|
2044
2038
|
// wait for task_queue empty to keep task order.
|
|
2045
|
-
|
|
2046
|
-
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
|
2047
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2039
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2048
2040
|
cann_ctx_src->stream()));
|
|
2049
|
-
|
|
2050
|
-
//TODO:
|
|
2051
|
-
|
|
2041
|
+
// record event on src stream after the copy
|
|
2042
|
+
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
|
|
2043
|
+
// if (!cann_ctx_src->copy_event) {
|
|
2044
|
+
// ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
|
|
2045
|
+
// }
|
|
2046
|
+
// ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
|
|
2047
|
+
|
|
2048
|
+
// // wait on dst stream for the copy to complete
|
|
2049
|
+
// ggml_cann_set_device(cann_ctx_dst->device);
|
|
2050
|
+
// ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
|
|
2051
|
+
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
|
|
2052
2052
|
} else {
|
|
2053
2053
|
// src and dst are on the same backend
|
|
2054
|
-
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
|
|
2055
|
-
ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2054
|
+
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
|
2056
2055
|
cann_ctx_dst->stream()));
|
|
2057
2056
|
}
|
|
2058
2057
|
|
|
@@ -2068,39 +2067,65 @@ static bool ggml_backend_cann_cpy_tensor_async(
|
|
|
2068
2067
|
* @param backend Pointer to the CANN backend structure to synchronize.
|
|
2069
2068
|
*/
|
|
2070
2069
|
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|
2071
|
-
ggml_backend_cann_context* cann_ctx =
|
|
2072
|
-
(ggml_backend_cann_context*)backend->context;
|
|
2073
|
-
cann_ctx->task_queue.wait();
|
|
2070
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
2074
2071
|
ggml_cann_set_device(cann_ctx->device);
|
|
2075
2072
|
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
|
2076
2073
|
}
|
|
2077
2074
|
|
|
2078
2075
|
#ifdef USE_ACL_GRAPH
|
|
2079
2076
|
/**
|
|
2080
|
-
* @brief
|
|
2077
|
+
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
|
|
2081
2078
|
*
|
|
2082
|
-
* This function
|
|
2083
|
-
*
|
|
2079
|
+
* This function creates a new ggml_cann_graph object and fills its node properties
|
|
2080
|
+
* (operation type, dimensions, strides, input sources, and operation parameters)
|
|
2081
|
+
* based on the current ggml computation graph.
|
|
2084
2082
|
*
|
|
2085
|
-
*
|
|
2086
|
-
*
|
|
2083
|
+
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
|
|
2084
|
+
* - node address
|
|
2085
|
+
* - operation type
|
|
2086
|
+
* - shape (ne) and strides (nb)
|
|
2087
|
+
* - source tensor addresses
|
|
2088
|
+
* - operation parameters
|
|
2089
|
+
*
|
|
2090
|
+
* After initialization, the new graph is pushed into the LRU cache owned by the
|
|
2091
|
+
* CANN backend context. The cache takes ownership of the graph and manages its
|
|
2092
|
+
* lifetime (including deletion upon eviction).
|
|
2093
|
+
*
|
|
2094
|
+
* @param cann_ctx The CANN backend context containing the graph cache.
|
|
2095
|
+
* @param cgraph The current ggml computation graph.
|
|
2087
2096
|
*/
|
|
2088
|
-
static void
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
|
|
2097
|
+
static void add_lru_matched_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
|
2098
|
+
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
|
|
2099
|
+
ggml_cann_graph * new_graph = new ggml_cann_graph();
|
|
2100
|
+
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
|
2093
2101
|
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2102
|
+
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
|
|
2103
|
+
ggml_tensor * node = cgraph->nodes[node_idx];
|
|
2104
|
+
auto & prop = new_graph->ggml_graph_properties[node_idx];
|
|
2105
|
+
|
|
2106
|
+
prop.node_address = node->data;
|
|
2107
|
+
prop.node_op = node->op;
|
|
2108
|
+
|
|
2109
|
+
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
|
|
2110
|
+
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
|
|
2111
|
+
|
|
2112
|
+
for (int src = 0; src < GGML_MAX_SRC; ++src) {
|
|
2113
|
+
if (node->src[src]) {
|
|
2114
|
+
prop.src_address[src] = node->src[src]->data;
|
|
2115
|
+
std::copy_n(node->src[src]->ne, GGML_MAX_DIMS, prop.src_ne[src]);
|
|
2116
|
+
std::copy_n(node->src[src]->nb, GGML_MAX_DIMS, prop.src_nb[src]);
|
|
2117
|
+
} else {
|
|
2118
|
+
prop.src_address[src] = nullptr;
|
|
2119
|
+
std::fill_n(prop.src_ne[src], GGML_MAX_DIMS, 0);
|
|
2120
|
+
std::fill_n(prop.src_nb[src], GGML_MAX_DIMS, 0);
|
|
2121
|
+
}
|
|
2101
2122
|
}
|
|
2102
|
-
|
|
2123
|
+
|
|
2124
|
+
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
|
2103
2125
|
}
|
|
2126
|
+
|
|
2127
|
+
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
|
|
2128
|
+
cann_ctx->graph_lru_cache.push(new_graph);
|
|
2104
2129
|
}
|
|
2105
2130
|
|
|
2106
2131
|
/**
|
|
@@ -2113,14 +2138,16 @@ static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx,
|
|
|
2113
2138
|
* @param graph_node_properties The stored properties of a CANN graph node.
|
|
2114
2139
|
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
|
2115
2140
|
*/
|
|
2116
|
-
static bool ggml_graph_node_has_matching_properties(ggml_tensor *
|
|
2117
|
-
|
|
2118
|
-
|
|
2141
|
+
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node,
|
|
2142
|
+
ggml_graph_node_properties * graph_node_properties) {
|
|
2143
|
+
if (node->data != graph_node_properties->node_address && node->op != GGML_OP_VIEW) {
|
|
2119
2144
|
return false;
|
|
2120
2145
|
}
|
|
2146
|
+
|
|
2121
2147
|
if (node->op != graph_node_properties->node_op) {
|
|
2122
2148
|
return false;
|
|
2123
2149
|
}
|
|
2150
|
+
|
|
2124
2151
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
|
2125
2152
|
if (node->ne[i] != graph_node_properties->ne[i]) {
|
|
2126
2153
|
return false;
|
|
@@ -2129,46 +2156,74 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
|
|
|
2129
2156
|
return false;
|
|
2130
2157
|
}
|
|
2131
2158
|
}
|
|
2159
|
+
|
|
2132
2160
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
|
2133
|
-
if (node->src[i]
|
|
2134
|
-
node->src[i]->data != graph_node_properties->src_address[i] &&
|
|
2135
|
-
|
|
2136
|
-
|
|
2137
|
-
|
|
2161
|
+
if (node->src[i]) {
|
|
2162
|
+
if (node->src[i]->data != graph_node_properties->src_address[i] && node->op != GGML_OP_VIEW) {
|
|
2163
|
+
return false;
|
|
2164
|
+
}
|
|
2165
|
+
|
|
2166
|
+
for (int d = 0; d < GGML_MAX_DIMS; d++) {
|
|
2167
|
+
if (node->src[i]->ne[d] != graph_node_properties->src_ne[i][d]) {
|
|
2168
|
+
return false;
|
|
2169
|
+
}
|
|
2170
|
+
if (node->src[i]->nb[d] != graph_node_properties->src_nb[i][d]) {
|
|
2171
|
+
return false;
|
|
2172
|
+
}
|
|
2173
|
+
}
|
|
2174
|
+
} else {
|
|
2175
|
+
if (graph_node_properties->src_address[i] != nullptr) {
|
|
2176
|
+
return false;
|
|
2177
|
+
}
|
|
2138
2178
|
}
|
|
2139
2179
|
}
|
|
2140
|
-
|
|
2141
|
-
|
|
2142
|
-
return
|
|
2180
|
+
|
|
2181
|
+
if (node->op == GGML_OP_SCALE || node->op == GGML_OP_UNARY || node->op == GGML_OP_GLU) {
|
|
2182
|
+
return memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) == 0;
|
|
2143
2183
|
}
|
|
2144
2184
|
return true;
|
|
2145
2185
|
}
|
|
2146
2186
|
|
|
2147
2187
|
/**
|
|
2148
|
-
* @brief
|
|
2188
|
+
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
|
|
2189
|
+
*
|
|
2190
|
+
* This function iterates through the cached CANN graphs stored in the LRU cache and
|
|
2191
|
+
* compares them against the given ggml computation graph. A match requires that the
|
|
2192
|
+
* number of nodes is the same and that each node’s properties (operation type,
|
|
2193
|
+
* dimensions, strides, inputs, and operation parameters) are identical.
|
|
2149
2194
|
*
|
|
2150
|
-
*
|
|
2151
|
-
*
|
|
2195
|
+
* If a matching graph is found, it is promoted to the front of the LRU cache and the
|
|
2196
|
+
* function returns true. Otherwise, the function returns false, indicating that a new
|
|
2197
|
+
* CANN graph needs to be captured.
|
|
2152
2198
|
*
|
|
2153
|
-
* @param cann_ctx The CANN backend context.
|
|
2199
|
+
* @param cann_ctx The CANN backend context containing the graph cache.
|
|
2154
2200
|
* @param cgraph The current ggml computation graph.
|
|
2155
|
-
* @return true if
|
|
2156
|
-
*/
|
|
2157
|
-
static bool
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2201
|
+
* @return true if a matching cached graph exists; false otherwise.
|
|
2202
|
+
*/
|
|
2203
|
+
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
|
2204
|
+
ggml_cann_graph_lru_cache & lru_cache = cann_ctx->graph_lru_cache;
|
|
2205
|
+
for (auto & graph_ptr : lru_cache.cache_list) {
|
|
2206
|
+
// Skip graphs with a different number of nodes.
|
|
2207
|
+
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
|
|
2208
|
+
continue;
|
|
2209
|
+
}
|
|
2163
2210
|
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
cgraph->nodes[i], &
|
|
2168
|
-
|
|
2211
|
+
// Check if all nodes match.
|
|
2212
|
+
bool all_match = true;
|
|
2213
|
+
for (int i = 0; i < cgraph->n_nodes; ++i) {
|
|
2214
|
+
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
|
|
2215
|
+
all_match = false;
|
|
2216
|
+
break;
|
|
2217
|
+
}
|
|
2218
|
+
}
|
|
2219
|
+
|
|
2220
|
+
if (all_match) {
|
|
2221
|
+
// update cache_list && renturn graph_ptr
|
|
2222
|
+
lru_cache.move_to_front(graph_ptr);
|
|
2169
2223
|
return true;
|
|
2170
2224
|
}
|
|
2171
2225
|
}
|
|
2226
|
+
|
|
2172
2227
|
return false;
|
|
2173
2228
|
}
|
|
2174
2229
|
#endif // USE_ACL_GRAPH
|
|
@@ -2186,25 +2241,23 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx,
|
|
|
2186
2241
|
* @param use_cann_graph Whether to use CANN graph execution.
|
|
2187
2242
|
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
|
|
2188
2243
|
*/
|
|
2189
|
-
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
|
|
2190
|
-
|
|
2244
|
+
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx,
|
|
2245
|
+
ggml_cgraph * cgraph,
|
|
2246
|
+
bool & use_cann_graph,
|
|
2247
|
+
bool & cann_graph_update_required) {
|
|
2191
2248
|
#ifdef USE_ACL_GRAPH
|
|
2192
|
-
if (use_cann_graph && cann_graph_update_required) {
|
|
2193
|
-
if (cann_ctx->cann_graph->graph != nullptr) {
|
|
2194
|
-
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
|
|
2195
|
-
cann_ctx->cann_graph->graph = nullptr;
|
|
2196
|
-
}
|
|
2249
|
+
if (use_cann_graph && cann_graph_update_required) { // Begin CANN graph capture
|
|
2197
2250
|
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
|
2198
2251
|
}
|
|
2199
|
-
#endif
|
|
2200
|
-
|
|
2252
|
+
#endif // USE_ACL_GRAPH
|
|
2201
2253
|
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
|
2202
2254
|
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
|
2203
2255
|
if (!use_cann_graph || cann_graph_update_required) {
|
|
2204
2256
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
2205
2257
|
ggml_tensor * node = cgraph->nodes[i];
|
|
2206
2258
|
|
|
2207
|
-
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
|
|
2259
|
+
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE ||
|
|
2260
|
+
node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
|
2208
2261
|
continue;
|
|
2209
2262
|
}
|
|
2210
2263
|
|
|
@@ -2217,18 +2270,19 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
|
|
2217
2270
|
}
|
|
2218
2271
|
|
|
2219
2272
|
#ifdef USE_ACL_GRAPH
|
|
2220
|
-
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
|
|
2221
|
-
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
|
|
2222
|
-
}
|
|
2223
|
-
|
|
2224
2273
|
if (use_cann_graph) {
|
|
2225
|
-
|
|
2226
|
-
|
|
2274
|
+
ggml_cann_graph * matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
|
|
2275
|
+
|
|
2276
|
+
if (cann_graph_update_required) { // End CANN graph capture
|
|
2277
|
+
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
|
|
2278
|
+
}
|
|
2279
|
+
|
|
2280
|
+
// Execute CANN graph
|
|
2281
|
+
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
|
|
2227
2282
|
}
|
|
2228
|
-
#endif
|
|
2283
|
+
#endif // USE_ACL_GRAPH
|
|
2229
2284
|
}
|
|
2230
2285
|
|
|
2231
|
-
|
|
2232
2286
|
/**
|
|
2233
2287
|
* @brief Computes a computational graph using a CANN backend.
|
|
2234
2288
|
*
|
|
@@ -2241,36 +2295,50 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
|
|
|
2241
2295
|
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
|
2242
2296
|
* completes successfully, otherwise an appropriate error status.
|
|
2243
2297
|
*/
|
|
2244
|
-
static enum ggml_status ggml_backend_cann_graph_compute(
|
|
2245
|
-
|
|
2246
|
-
ggml_backend_cann_context* cann_ctx =
|
|
2247
|
-
(ggml_backend_cann_context*)backend->context;
|
|
2298
|
+
static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
2299
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
2248
2300
|
ggml_cann_set_device(cann_ctx->device);
|
|
2249
|
-
|
|
2301
|
+
g_nz_workspaces[cann_ctx->device].clear();
|
|
2302
|
+
|
|
2303
|
+
// calculate rope cache for fist layer in current device.
|
|
2304
|
+
cann_ctx->rope_cache.cached = false;
|
|
2305
|
+
|
|
2250
2306
|
#ifdef USE_ACL_GRAPH
|
|
2251
|
-
bool use_cann_graph
|
|
2307
|
+
bool use_cann_graph = true;
|
|
2252
2308
|
bool cann_graph_update_required = false;
|
|
2253
2309
|
|
|
2254
|
-
|
|
2255
|
-
|
|
2256
|
-
|
|
2257
|
-
|
|
2310
|
+
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
|
|
2311
|
+
if (!prefill_use_graph) {
|
|
2312
|
+
// Do not use acl_graph for prefill.
|
|
2313
|
+
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
2314
|
+
ggml_tensor * node = cgraph->nodes[i];
|
|
2315
|
+
// TODO: Optimize here. Currently, we can only
|
|
2316
|
+
// get seq_len by FA's input.
|
|
2317
|
+
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
|
|
2318
|
+
// Q -> src[0], shape: [B, S, N, D]
|
|
2319
|
+
use_cann_graph = (node->src[0]->ne[1] == 1);
|
|
2320
|
+
break;
|
|
2321
|
+
}
|
|
2258
2322
|
}
|
|
2323
|
+
}
|
|
2324
|
+
|
|
2325
|
+
if (!cann_ctx->acl_graph_mode) {
|
|
2326
|
+
use_cann_graph = false;
|
|
2327
|
+
}
|
|
2259
2328
|
|
|
2260
|
-
|
|
2261
|
-
|
|
2329
|
+
if (use_cann_graph) {
|
|
2330
|
+
// If no matching graph is found, the graph needs to be recaptured.
|
|
2331
|
+
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
|
|
2332
|
+
if (cann_graph_update_required) {
|
|
2333
|
+
// If no matching graph is found, add a new ACL graph.
|
|
2334
|
+
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
|
|
2335
|
+
}
|
|
2262
2336
|
}
|
|
2263
2337
|
#else
|
|
2264
|
-
bool use_cann_graph
|
|
2338
|
+
bool use_cann_graph = false;
|
|
2265
2339
|
bool cann_graph_update_required = false;
|
|
2266
2340
|
#endif // USE_ACL_GRAPH
|
|
2267
|
-
|
|
2268
|
-
evaluate_and_capture_cann_graph(
|
|
2269
|
-
cann_ctx,
|
|
2270
|
-
cgraph,
|
|
2271
|
-
use_cann_graph,
|
|
2272
|
-
cann_graph_update_required
|
|
2273
|
-
);
|
|
2341
|
+
evaluate_and_capture_cann_graph(cann_ctx, cgraph, use_cann_graph, cann_graph_update_required);
|
|
2274
2342
|
|
|
2275
2343
|
return GGML_STATUS_SUCCESS;
|
|
2276
2344
|
}
|
|
@@ -2287,8 +2355,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
|
|
2287
2355
|
* @return bool Returns true if the operation is supported by the backend,
|
|
2288
2356
|
* otherwise false.
|
|
2289
2357
|
*/
|
|
2290
|
-
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
2291
|
-
const ggml_tensor* op) {
|
|
2358
|
+
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
|
2292
2359
|
switch (op->op) {
|
|
2293
2360
|
case GGML_OP_UNARY:
|
|
2294
2361
|
switch (ggml_get_unary_op(op)) {
|
|
@@ -2323,24 +2390,24 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2323
2390
|
return false;
|
|
2324
2391
|
}
|
|
2325
2392
|
break;
|
|
2326
|
-
case GGML_OP_MUL_MAT:
|
|
2327
|
-
|
|
2328
|
-
|
|
2329
|
-
|
|
2330
|
-
|
|
2331
|
-
|
|
2332
|
-
|
|
2393
|
+
case GGML_OP_MUL_MAT:
|
|
2394
|
+
{
|
|
2395
|
+
switch (op->src[0]->type) {
|
|
2396
|
+
case GGML_TYPE_F16:
|
|
2397
|
+
case GGML_TYPE_F32:
|
|
2398
|
+
return true;
|
|
2399
|
+
case GGML_TYPE_Q8_0:
|
|
2400
|
+
case GGML_TYPE_Q4_0:
|
|
2333
2401
|
#ifdef ASCEND_310P
|
|
2334
|
-
|
|
2335
|
-
|
|
2402
|
+
// Q4 && Q8 per group is not support on 310p device
|
|
2403
|
+
return false;
|
|
2336
2404
|
#endif
|
|
2337
|
-
|
|
2338
|
-
|
|
2339
|
-
|
|
2340
|
-
|
|
2341
|
-
|
|
2405
|
+
// only support contiguous for quantized types.
|
|
2406
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2407
|
+
default:
|
|
2408
|
+
return false;
|
|
2409
|
+
}
|
|
2342
2410
|
}
|
|
2343
|
-
}
|
|
2344
2411
|
case GGML_OP_MUL_MAT_ID:
|
|
2345
2412
|
switch (op->src[0]->type) {
|
|
2346
2413
|
case GGML_TYPE_F16:
|
|
@@ -2353,106 +2420,115 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2353
2420
|
return false;
|
|
2354
2421
|
#endif
|
|
2355
2422
|
// only support contiguous for quantized types.
|
|
2356
|
-
return ggml_is_contiguous(op->src[0]) &&
|
|
2357
|
-
ggml_is_contiguous(op->src[1]);
|
|
2423
|
+
return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
|
|
2358
2424
|
default:
|
|
2359
2425
|
return false;
|
|
2360
2426
|
}
|
|
2361
2427
|
// embedding
|
|
2362
|
-
case GGML_OP_GET_ROWS:
|
|
2363
|
-
|
|
2364
|
-
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
|
|
2368
|
-
|
|
2369
|
-
|
|
2370
|
-
|
|
2371
|
-
|
|
2372
|
-
case GGML_OP_SET_ROWS: {
|
|
2373
|
-
switch (op->type) {
|
|
2374
|
-
case GGML_TYPE_F32:
|
|
2375
|
-
case GGML_TYPE_F16:
|
|
2376
|
-
return true;
|
|
2377
|
-
default:
|
|
2378
|
-
return false;
|
|
2428
|
+
case GGML_OP_GET_ROWS:
|
|
2429
|
+
{
|
|
2430
|
+
switch (op->src[0]->type) {
|
|
2431
|
+
case GGML_TYPE_F32:
|
|
2432
|
+
case GGML_TYPE_F16:
|
|
2433
|
+
case GGML_TYPE_Q8_0:
|
|
2434
|
+
return true;
|
|
2435
|
+
default:
|
|
2436
|
+
return false;
|
|
2437
|
+
}
|
|
2379
2438
|
}
|
|
2380
|
-
|
|
2381
|
-
case
|
|
2382
|
-
|
|
2383
|
-
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
|
|
2387
|
-
|
|
2439
|
+
break;
|
|
2440
|
+
case GGML_OP_SET_ROWS:
|
|
2441
|
+
{
|
|
2442
|
+
switch (op->type) {
|
|
2443
|
+
case GGML_TYPE_F32:
|
|
2444
|
+
case GGML_TYPE_F16:
|
|
2445
|
+
return true;
|
|
2446
|
+
default:
|
|
2447
|
+
return false;
|
|
2448
|
+
}
|
|
2388
2449
|
}
|
|
2389
|
-
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2393
|
-
|
|
2394
|
-
|
|
2395
|
-
|
|
2396
|
-
return true;
|
|
2397
|
-
default:
|
|
2450
|
+
break;
|
|
2451
|
+
case GGML_OP_CPY:
|
|
2452
|
+
{
|
|
2453
|
+
ggml_tensor * src = op->src[0];
|
|
2454
|
+
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
|
2455
|
+
(src->type != GGML_TYPE_F32 && src->type != GGML_TYPE_F16)) {
|
|
2456
|
+
// only support F32 and F16.
|
|
2398
2457
|
return false;
|
|
2458
|
+
}
|
|
2459
|
+
return true;
|
|
2399
2460
|
}
|
|
2400
|
-
|
|
2401
|
-
case
|
|
2402
|
-
|
|
2403
|
-
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
return false;
|
|
2412
|
-
}
|
|
2413
|
-
|
|
2414
|
-
const int mode = ((const int32_t *) op->op_params)[2];
|
|
2415
|
-
if (mode & GGML_ROPE_TYPE_MROPE) {
|
|
2416
|
-
return false;
|
|
2417
|
-
}
|
|
2418
|
-
if (mode & GGML_ROPE_TYPE_VISION) {
|
|
2419
|
-
return false;
|
|
2461
|
+
break;
|
|
2462
|
+
case GGML_OP_CONT:
|
|
2463
|
+
{
|
|
2464
|
+
// TODO: support GGML_TYPE_BF16
|
|
2465
|
+
switch (op->src[0]->type) {
|
|
2466
|
+
case GGML_TYPE_F32:
|
|
2467
|
+
case GGML_TYPE_F16:
|
|
2468
|
+
return true;
|
|
2469
|
+
default:
|
|
2470
|
+
return false;
|
|
2471
|
+
}
|
|
2420
2472
|
}
|
|
2473
|
+
case GGML_OP_ROPE:
|
|
2474
|
+
{
|
|
2475
|
+
// TODO: with ops-test v == 1
|
|
2476
|
+
// TODO: n_dims <= ne0
|
|
2477
|
+
if (op->src[0]->ne[0] != op->op_params[1]) {
|
|
2478
|
+
return false;
|
|
2479
|
+
}
|
|
2421
2480
|
|
|
2422
|
-
|
|
2423
|
-
|
|
2424
|
-
|
|
2425
|
-
|
|
2426
|
-
|
|
2427
|
-
|
|
2428
|
-
|
|
2429
|
-
|
|
2430
|
-
|
|
2431
|
-
|
|
2481
|
+
const int mode = ((const int32_t *) op->op_params)[2];
|
|
2482
|
+
if (mode & GGML_ROPE_TYPE_MROPE) {
|
|
2483
|
+
return false;
|
|
2484
|
+
}
|
|
2485
|
+
if (mode & GGML_ROPE_TYPE_VISION) {
|
|
2486
|
+
return false;
|
|
2487
|
+
}
|
|
2488
|
+
if (op->src[0]->ne[0] > 896) {
|
|
2489
|
+
return false;
|
|
2490
|
+
}
|
|
2491
|
+
#ifdef ASCEND_310P
|
|
2492
|
+
if (!ggml_is_contiguous(op->src[0])) {
|
|
2493
|
+
return false;
|
|
2494
|
+
}
|
|
2495
|
+
#endif
|
|
2496
|
+
return true;
|
|
2432
2497
|
}
|
|
2433
|
-
|
|
2434
|
-
|
|
2498
|
+
case GGML_OP_UPSCALE:
|
|
2499
|
+
{
|
|
2500
|
+
// aclnnUpsampleNearest2dGetWorkspaceSize not support
|
|
2501
|
+
// selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
|
|
2502
|
+
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
|
|
2503
|
+
return false;
|
|
2504
|
+
}
|
|
2505
|
+
if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
|
|
2506
|
+
return false;
|
|
2507
|
+
}
|
|
2508
|
+
return true;
|
|
2435
2509
|
}
|
|
2436
|
-
|
|
2437
|
-
|
|
2438
|
-
|
|
2439
|
-
const int32_t * opts = (const int32_t *) op->op_params;
|
|
2510
|
+
case GGML_OP_POOL_2D:
|
|
2511
|
+
{
|
|
2512
|
+
const int32_t * opts = (const int32_t *) op->op_params;
|
|
2440
2513
|
#ifdef ASCEND_310P
|
|
2441
|
-
|
|
2442
|
-
|
|
2443
|
-
|
|
2444
|
-
|
|
2514
|
+
enum ggml_op_pool opt = static_cast<ggml_op_pool>(opts[0]);
|
|
2515
|
+
if (opt == GGML_OP_POOL_MAX) {
|
|
2516
|
+
return false;
|
|
2517
|
+
}
|
|
2445
2518
|
#endif
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
2451
|
-
|
|
2452
|
-
|
|
2453
|
-
|
|
2454
|
-
case GGML_OP_DUP:
|
|
2519
|
+
const int k0 = opts[1];
|
|
2520
|
+
const int k1 = opts[2];
|
|
2521
|
+
const int p0 = opts[5];
|
|
2522
|
+
const int p1 = opts[6];
|
|
2523
|
+
// value of paddingH should be at most half of kernelH
|
|
2524
|
+
// value of paddingW should be at most half of kernelW
|
|
2525
|
+
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
|
2526
|
+
}
|
|
2455
2527
|
case GGML_OP_SUM:
|
|
2528
|
+
return ggml_is_contiguous_rows(op->src[0]);
|
|
2529
|
+
case GGML_OP_L2_NORM:
|
|
2530
|
+
case GGML_OP_CROSS_ENTROPY_LOSS:
|
|
2531
|
+
case GGML_OP_DUP:
|
|
2456
2532
|
case GGML_OP_IM2COL:
|
|
2457
2533
|
case GGML_OP_CONCAT:
|
|
2458
2534
|
case GGML_OP_REPEAT:
|
|
@@ -2483,63 +2559,60 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|
|
2483
2559
|
case GGML_OP_ARGMAX:
|
|
2484
2560
|
case GGML_OP_COS:
|
|
2485
2561
|
case GGML_OP_SIN:
|
|
2486
|
-
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
2487
2562
|
case GGML_OP_LOG:
|
|
2488
2563
|
case GGML_OP_MEAN:
|
|
2489
2564
|
case GGML_OP_PAD_REFLECT_1D:
|
|
2490
2565
|
case GGML_OP_COUNT_EQUAL:
|
|
2491
2566
|
return true;
|
|
2567
|
+
case GGML_OP_CONV_TRANSPOSE_1D:
|
|
2568
|
+
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
|
|
2569
|
+
return (op->src[0]->ne[0] - 1) <= 255;
|
|
2492
2570
|
case GGML_OP_SCALE:
|
|
2493
2571
|
float bias;
|
|
2494
|
-
memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
|
|
2495
|
-
return bias == 0.0f;
|
|
2572
|
+
memcpy(&bias, (const float *) (op->op_params) + 1, sizeof(float));
|
|
2573
|
+
return bias == 0.0f; // TODO: support bias != 0.0f
|
|
2496
2574
|
case GGML_OP_SOFT_MAX:
|
|
2497
2575
|
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
|
2498
2576
|
if (op->src[2]) {
|
|
2499
2577
|
return false;
|
|
2500
2578
|
}
|
|
2501
2579
|
return true;
|
|
2502
|
-
case GGML_OP_FLASH_ATTN_EXT:
|
|
2580
|
+
case GGML_OP_FLASH_ATTN_EXT:
|
|
2581
|
+
{
|
|
2503
2582
|
#ifdef ASCEND_310P
|
|
2504
|
-
|
|
2505
|
-
return false;
|
|
2506
|
-
#endif
|
|
2507
|
-
// derived from [ggml-cuda.cu]
|
|
2508
|
-
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
|
|
2509
|
-
return false;
|
|
2510
|
-
}
|
|
2511
|
-
if(op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 && op->src[1]->type != GGML_TYPE_BF16){
|
|
2512
|
-
return false;
|
|
2513
|
-
}
|
|
2514
|
-
if(op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16){
|
|
2515
|
-
return false;
|
|
2516
|
-
}
|
|
2517
|
-
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
|
2518
|
-
if (op->src[4]) {
|
|
2519
|
-
return false;
|
|
2520
|
-
}
|
|
2521
|
-
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
|
2522
|
-
// different head sizes of K and V are not supported yet
|
|
2523
|
-
return false;
|
|
2524
|
-
}
|
|
2525
|
-
if (op->src[0]->ne[0] == 192) {
|
|
2526
|
-
return false;
|
|
2527
|
-
}
|
|
2528
|
-
if (op->src[0]->ne[0] == 576) {
|
|
2529
|
-
// DeepSeek MLA
|
|
2530
|
-
return false;
|
|
2531
|
-
}
|
|
2532
|
-
if (op->src[0]->ne[0] % 16 != 0) {
|
|
2533
|
-
// TODO: padding to support
|
|
2534
|
-
return false;
|
|
2535
|
-
}
|
|
2536
|
-
float logitSoftcap = 0.0f;
|
|
2537
|
-
memcpy(&logitSoftcap, (const float *)(op->op_params) + 2, sizeof(float));
|
|
2538
|
-
if(logitSoftcap != 0.0f) {
|
|
2583
|
+
// FA not support on 310p device
|
|
2539
2584
|
return false;
|
|
2585
|
+
#endif
|
|
2586
|
+
// derived from [ggml-cuda.cu]
|
|
2587
|
+
if (op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16) {
|
|
2588
|
+
return false;
|
|
2589
|
+
}
|
|
2590
|
+
if (op->src[1]->type != GGML_TYPE_F16 && op->src[1]->type != GGML_TYPE_F32 &&
|
|
2591
|
+
op->src[1]->type != GGML_TYPE_BF16) {
|
|
2592
|
+
return false;
|
|
2593
|
+
}
|
|
2594
|
+
if (op->type != GGML_TYPE_F16 && op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_BF16) {
|
|
2595
|
+
return false;
|
|
2596
|
+
}
|
|
2597
|
+
// TODO: support attention sinks [TAG_ATTN_SINKS]
|
|
2598
|
+
if (op->src[4]) {
|
|
2599
|
+
return false;
|
|
2600
|
+
}
|
|
2601
|
+
if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
|
|
2602
|
+
// different head sizes of K and V are not supported yet
|
|
2603
|
+
return false;
|
|
2604
|
+
}
|
|
2605
|
+
if (op->src[0]->ne[0] % 16 != 0) {
|
|
2606
|
+
// TODO: padding to support
|
|
2607
|
+
return false;
|
|
2608
|
+
}
|
|
2609
|
+
float logitSoftcap = 0.0f;
|
|
2610
|
+
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
|
|
2611
|
+
if (logitSoftcap != 0.0f) {
|
|
2612
|
+
return false;
|
|
2613
|
+
}
|
|
2614
|
+
return true;
|
|
2540
2615
|
}
|
|
2541
|
-
return true;
|
|
2542
|
-
}
|
|
2543
2616
|
default:
|
|
2544
2617
|
return false;
|
|
2545
2618
|
}
|
|
@@ -2576,8 +2649,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
|
|
|
2576
2649
|
* @return bool Returns true if the operation should be offloaded, otherwise
|
|
2577
2650
|
* false.
|
|
2578
2651
|
*/
|
|
2579
|
-
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
|
|
2580
|
-
const ggml_tensor* op) {
|
|
2652
|
+
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
|
|
2581
2653
|
const int min_batch_size = 32;
|
|
2582
2654
|
GGML_UNUSED(dev);
|
|
2583
2655
|
|
|
@@ -2593,9 +2665,8 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
|
|
|
2593
2665
|
* @param event Pointer to the event structure to be recorded.
|
|
2594
2666
|
*/
|
|
2595
2667
|
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
2596
|
-
ggml_backend_cann_context* cann_ctx =
|
|
2597
|
-
|
|
2598
|
-
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
|
|
2668
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
2669
|
+
ACL_CHECK(aclrtRecordEvent((aclrtEvent) event->context, cann_ctx->stream()));
|
|
2599
2670
|
}
|
|
2600
2671
|
|
|
2601
2672
|
/**
|
|
@@ -2608,13 +2679,10 @@ static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_
|
|
|
2608
2679
|
* @param event Pointer to the event structure that the backend needs to wait
|
|
2609
2680
|
* for.
|
|
2610
2681
|
*/
|
|
2611
|
-
static void ggml_backend_cann_event_wait(ggml_backend_t backend,
|
|
2612
|
-
|
|
2613
|
-
ggml_backend_cann_context* cann_ctx =
|
|
2614
|
-
(ggml_backend_cann_context*)backend->context;
|
|
2682
|
+
static void ggml_backend_cann_event_wait(ggml_backend_t backend, ggml_backend_event_t event) {
|
|
2683
|
+
ggml_backend_cann_context * cann_ctx = (ggml_backend_cann_context *) backend->context;
|
|
2615
2684
|
if (ggml_backend_is_cann(backend)) {
|
|
2616
|
-
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
|
|
2617
|
-
(aclrtEvent)event->context));
|
|
2685
|
+
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), (aclrtEvent) event->context));
|
|
2618
2686
|
} else {
|
|
2619
2687
|
GGML_ABORT("fatal error");
|
|
2620
2688
|
}
|
|
@@ -2641,6 +2709,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
|
|
2641
2709
|
/* .graph_compute = */ ggml_backend_cann_graph_compute,
|
|
2642
2710
|
/* .event_record = */ ggml_backend_cann_event_record,
|
|
2643
2711
|
/* .event_wait = */ ggml_backend_cann_event_wait,
|
|
2712
|
+
/* .graph_optimize = */ NULL,
|
|
2644
2713
|
};
|
|
2645
2714
|
|
|
2646
2715
|
/**
|
|
@@ -2652,30 +2721,30 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
|
|
2652
2721
|
* @return A pointer to the static GUID.
|
|
2653
2722
|
*/
|
|
2654
2723
|
static ggml_guid_t ggml_backend_cann_guid() {
|
|
2655
|
-
static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
|
|
2656
|
-
|
|
2724
|
+
static ggml_guid guid = { 0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34,
|
|
2725
|
+
0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64 };
|
|
2657
2726
|
return &guid;
|
|
2658
2727
|
}
|
|
2659
2728
|
|
|
2660
2729
|
// backend device
|
|
2661
2730
|
struct ggml_backend_cann_device_context {
|
|
2662
|
-
int
|
|
2731
|
+
int device;
|
|
2663
2732
|
std::string name;
|
|
2664
2733
|
std::string description;
|
|
2665
2734
|
};
|
|
2666
2735
|
|
|
2667
2736
|
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
|
|
2668
|
-
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2737
|
+
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2669
2738
|
return ctx->name.c_str();
|
|
2670
2739
|
}
|
|
2671
2740
|
|
|
2672
|
-
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
|
|
2673
|
-
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2741
|
+
static const char * ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
|
|
2742
|
+
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2674
2743
|
return ctx->description.c_str();
|
|
2675
2744
|
}
|
|
2676
2745
|
|
|
2677
2746
|
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
|
2678
|
-
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2747
|
+
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2679
2748
|
ggml_backend_cann_get_device_memory(ctx->device, free, total);
|
|
2680
2749
|
}
|
|
2681
2750
|
|
|
@@ -2702,7 +2771,7 @@ static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_back
|
|
|
2702
2771
|
|
|
2703
2772
|
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
|
|
2704
2773
|
GGML_UNUSED(params);
|
|
2705
|
-
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2774
|
+
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2706
2775
|
return ggml_backend_cann_init(ctx->device);
|
|
2707
2776
|
}
|
|
2708
2777
|
|
|
@@ -2719,19 +2788,17 @@ static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, cons
|
|
|
2719
2788
|
* @return bool Returns true if the CANN backend supports the buffer type,
|
|
2720
2789
|
* otherwise false.
|
|
2721
2790
|
*/
|
|
2722
|
-
static bool ggml_backend_cann_supports_buft(
|
|
2723
|
-
ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
|
2791
|
+
static bool ggml_backend_cann_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
|
2724
2792
|
if (ggml_backend_buft_is_cann(buft)) {
|
|
2725
|
-
ggml_backend_cann_device_context *
|
|
2726
|
-
ggml_backend_cann_buffer_type_context * buft_ctx =
|
|
2727
|
-
(ggml_backend_cann_buffer_type_context *)buft->context;
|
|
2793
|
+
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2794
|
+
ggml_backend_cann_buffer_type_context * buft_ctx = (ggml_backend_cann_buffer_type_context *) buft->context;
|
|
2728
2795
|
return buft_ctx->device == dev_ctx->device;
|
|
2729
2796
|
}
|
|
2730
2797
|
return false;
|
|
2731
2798
|
}
|
|
2732
2799
|
|
|
2733
2800
|
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
|
|
2734
|
-
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2801
|
+
ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2735
2802
|
return ggml_backend_cann_buffer_type(ctx->device);
|
|
2736
2803
|
}
|
|
2737
2804
|
|
|
@@ -2750,9 +2817,8 @@ static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(
|
|
|
2750
2817
|
* @param backend Pointer to the CANN backend.
|
|
2751
2818
|
* @return ggml_backend_event_t Returns a pointer to the new event structure.
|
|
2752
2819
|
*/
|
|
2753
|
-
static ggml_backend_event_t ggml_backend_cann_device_event_new(
|
|
2754
|
-
|
|
2755
|
-
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
|
|
2820
|
+
static ggml_backend_event_t ggml_backend_cann_device_event_new(ggml_backend_dev_t dev) {
|
|
2821
|
+
ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *) dev->context;
|
|
2756
2822
|
|
|
2757
2823
|
ggml_cann_set_device(dev_ctx->device);
|
|
2758
2824
|
|
|
@@ -2774,7 +2840,7 @@ static ggml_backend_event_t ggml_backend_cann_device_event_new(
|
|
|
2774
2840
|
* @param event Pointer to the event structure to be freed.
|
|
2775
2841
|
*/
|
|
2776
2842
|
static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
|
2777
|
-
ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context));
|
|
2843
|
+
ACL_CHECK(aclrtDestroyEvent((aclrtEvent) event->context));
|
|
2778
2844
|
|
|
2779
2845
|
delete event;
|
|
2780
2846
|
GGML_UNUSED(dev);
|
|
@@ -2788,7 +2854,7 @@ static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_bac
|
|
|
2788
2854
|
* @param event Pointer to the event structure to be synchronized.
|
|
2789
2855
|
*/
|
|
2790
2856
|
static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
|
|
2791
|
-
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context));
|
|
2857
|
+
ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent) event->context));
|
|
2792
2858
|
|
|
2793
2859
|
GGML_UNUSED(dev);
|
|
2794
2860
|
}
|
|
@@ -2799,10 +2865,10 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
|
|
|
2799
2865
|
/* .get_memory = */ ggml_backend_cann_device_get_memory,
|
|
2800
2866
|
/* .get_type = */ ggml_backend_cann_device_get_type,
|
|
2801
2867
|
/* .get_props = */ ggml_backend_cann_device_get_props,
|
|
2802
|
-
/* .init_backend = */ ggml_backend_cann_device_init,
|
|
2868
|
+
/* .init_backend = */ ggml_backend_cann_device_init, // called for every card
|
|
2803
2869
|
/* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
|
|
2804
2870
|
/* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
|
|
2805
|
-
/* .buffer_from_host_ptr = */ NULL,
|
|
2871
|
+
/* .buffer_from_host_ptr = */ NULL, // not supported for CANN
|
|
2806
2872
|
/* .supports_op = */ ggml_backend_cann_supports_op,
|
|
2807
2873
|
/* .supports_buft = */ ggml_backend_cann_supports_buft,
|
|
2808
2874
|
/* .offload_op = */ ggml_backend_cann_offload_op,
|
|
@@ -2811,7 +2877,6 @@ static const ggml_backend_device_i ggml_backend_cann_device_interface = {
|
|
|
2811
2877
|
/* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
|
|
2812
2878
|
};
|
|
2813
2879
|
|
|
2814
|
-
|
|
2815
2880
|
// backend reg
|
|
2816
2881
|
struct ggml_backend_cann_reg_context {
|
|
2817
2882
|
std::vector<ggml_backend_dev_t> devices;
|
|
@@ -2823,12 +2888,12 @@ static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
|
|
|
2823
2888
|
}
|
|
2824
2889
|
|
|
2825
2890
|
static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
|
|
2826
|
-
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
|
|
2891
|
+
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
|
|
2827
2892
|
return ctx->devices.size();
|
|
2828
2893
|
}
|
|
2829
2894
|
|
|
2830
2895
|
static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
|
2831
|
-
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context;
|
|
2896
|
+
ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *) reg->context;
|
|
2832
2897
|
GGML_ASSERT(index < ctx->devices.size());
|
|
2833
2898
|
return ctx->devices[index];
|
|
2834
2899
|
}
|
|
@@ -2850,34 +2915,30 @@ static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
|
|
|
2850
2915
|
// backend registry, called only once for cann backend
|
|
2851
2916
|
ggml_backend_reg_t ggml_backend_cann_reg() {
|
|
2852
2917
|
static ggml_backend_reg reg;
|
|
2853
|
-
static bool
|
|
2918
|
+
static bool initialized = false;
|
|
2854
2919
|
|
|
2855
2920
|
{
|
|
2856
|
-
static std::mutex
|
|
2921
|
+
static std::mutex mutex;
|
|
2857
2922
|
std::lock_guard<std::mutex> lock(mutex);
|
|
2858
2923
|
if (!initialized) {
|
|
2859
2924
|
aclInit(nullptr);
|
|
2860
2925
|
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
|
|
2861
2926
|
|
|
2862
2927
|
for (int i = 0; i < ggml_cann_info().device_count; i++) {
|
|
2863
|
-
ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
|
|
2864
|
-
dev_ctx->description
|
|
2865
|
-
dev_ctx->device
|
|
2866
|
-
dev_ctx->name
|
|
2928
|
+
ggml_backend_cann_device_context * dev_ctx = new ggml_backend_cann_device_context();
|
|
2929
|
+
dev_ctx->description = aclrtGetSocName();
|
|
2930
|
+
dev_ctx->device = i;
|
|
2931
|
+
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
|
|
2867
2932
|
ggml_cann_set_device(i);
|
|
2868
|
-
ggml_backend_dev_t dev = new ggml_backend_device
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
|
-
/* .context = */ dev_ctx
|
|
2872
|
-
};
|
|
2933
|
+
ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_cann_device_interface,
|
|
2934
|
+
/* .reg = */ ®,
|
|
2935
|
+
/* .context = */ dev_ctx };
|
|
2873
2936
|
ctx->devices.push_back(dev);
|
|
2874
2937
|
}
|
|
2875
2938
|
|
|
2876
|
-
reg = ggml_backend_reg
|
|
2877
|
-
|
|
2878
|
-
|
|
2879
|
-
/* .context = */ ctx
|
|
2880
|
-
};
|
|
2939
|
+
reg = ggml_backend_reg{ /* .api_version = */ GGML_BACKEND_API_VERSION,
|
|
2940
|
+
/* .iface = */ ggml_backend_cann_reg_interface,
|
|
2941
|
+
/* .context = */ ctx };
|
|
2881
2942
|
}
|
|
2882
2943
|
|
|
2883
2944
|
initialized = true;
|
|
@@ -2893,39 +2954,36 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
|
|
2893
2954
|
return nullptr;
|
|
2894
2955
|
}
|
|
2895
2956
|
|
|
2896
|
-
ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
|
|
2957
|
+
ggml_backend_cann_context * ctx = new ggml_backend_cann_context(device);
|
|
2897
2958
|
if (ctx == nullptr) {
|
|
2898
2959
|
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
|
2899
2960
|
return nullptr;
|
|
2900
2961
|
}
|
|
2901
2962
|
ggml_cann_set_device(ctx->device);
|
|
2902
2963
|
ggml_backend_t cann_backend =
|
|
2903
|
-
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2964
|
+
new ggml_backend{ /* .guid = */ ggml_backend_cann_guid(),
|
|
2965
|
+
/* .interface = */ ggml_backend_cann_interface,
|
|
2966
|
+
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
|
|
2967
|
+
/* .context = */ ctx };
|
|
2907
2968
|
|
|
2908
2969
|
return cann_backend;
|
|
2909
2970
|
}
|
|
2910
2971
|
|
|
2911
2972
|
bool ggml_backend_is_cann(ggml_backend_t backend) {
|
|
2912
|
-
return backend != NULL &&
|
|
2913
|
-
ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
|
|
2973
|
+
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
|
|
2914
2974
|
}
|
|
2915
2975
|
|
|
2916
2976
|
int32_t ggml_backend_cann_get_device_count() {
|
|
2917
2977
|
return ggml_cann_info().device_count;
|
|
2918
2978
|
}
|
|
2919
2979
|
|
|
2920
|
-
void ggml_backend_cann_get_device_description(
|
|
2921
|
-
int32_t device, char* description, size_t description_size) {
|
|
2980
|
+
void ggml_backend_cann_get_device_description(int32_t device, char * description, size_t description_size) {
|
|
2922
2981
|
ggml_cann_set_device(device);
|
|
2923
|
-
const char* soc_name = aclrtGetSocName();
|
|
2982
|
+
const char * soc_name = aclrtGetSocName();
|
|
2924
2983
|
snprintf(description, description_size, "%s", soc_name);
|
|
2925
2984
|
}
|
|
2926
2985
|
|
|
2927
|
-
void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
|
2928
|
-
size_t* total) {
|
|
2986
|
+
void ggml_backend_cann_get_device_memory(int32_t device, size_t * free, size_t * total) {
|
|
2929
2987
|
ggml_cann_set_device(device);
|
|
2930
2988
|
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
|
2931
2989
|
}
|